In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_csv(r"gender_classifier.csv",encoding = "latin1")
data = pd.concat([data.gender, data.description],axis=1)
data.dropna(axis=0,inplace=True)
data.gender = [1 if each == "female" else 0 for each in data.gender]

In [3]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

data['clean_description'] = data['description'].apply(clean_text)

In [4]:
first_description = data.description[4]
description = re.sub("[^a-zA-Z]"," ",first_description)
description = description.lower() #buyuk harften kucuk harfe cevirme

In [10]:
import spacy

nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc]
    return ' '.join(lemmatized_tokens)

# Lemmatizasyon işlemi
data['lemmatized_description'] = data['clean_description'].apply(lemmatize_text)


In [11]:
from spacy.lang.en.stop_words import STOP_WORDS

custom_stop_words = set(STOP_WORDS)

def remove_stop_words(text):
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in custom_stop_words]
    return ' '.join(filtered_tokens)

data['no_stop_words_description'] = data['lemmatized_description'].apply(remove_stop_words)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vektörizasyonu
tfidf_vectorizer = TfidfVectorizer(max_features=1500)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['no_stop_words_description'])

# TF-IDF matrisi üzerindeki öznitelik isimlerini alma
feature_names = tfidf_vectorizer.get_feature_names_out()

# TF-IDF matrisini DataFrame'e dönüştürme (isteğe bağlı)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Bağımlı ve bağımsız değişkenlerin ayrıştırılması
X = tfidf_matrix
y = data['gender']

# Eğitim ve test setlerine ayırma
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForestClassifier modelinin oluşturulması ve eğitimi
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Test seti üzerinde tahmin yapma
y_pred = rf_classifier.predict(X_test)

# Model performansını değerlendirme
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.72      0.83      0.77      2132
           1       0.55      0.39      0.46      1113

    accuracy                           0.68      3245
   macro avg       0.64      0.61      0.61      3245
weighted avg       0.66      0.68      0.66      3245



In [14]:
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results_df.head(10))


       Actual  Predicted
5545        1          0
3233        0          0
1938        0          0
988         1          0
17823       0          0
13503       0          0
3494        1          0
16761       1          1
2011        0          0
94          0          0
