In [1]:
import os
import zipfile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load dataset
dataset_zip = os.path.join("data", "dataset.zip")
with zipfile.ZipFile(dataset_zip, 'r') as zip:
    zip.extractall("data")

csv_path = os.path.join("data", "gender-classifier-DFE-791531.csv")
df = pd.read_csv(csv_path, sep=",", encoding="ISO-8859-1")
print(f"Dataset size: {df.shape[0]} rows")
print(f"Columns: {list(df.columns)}")
#print(df.head())

Dataset size: 20050 rows
Columns: ['_unit_id', '_golden', '_unit_state', '_trusted_judgments', '_last_judgment_at', 'gender', 'gender:confidence', 'profile_yn', 'profile_yn:confidence', 'created', 'description', 'fav_number', 'gender_gold', 'link_color', 'name', 'profile_yn_gold', 'profileimage', 'retweet_count', 'sidebar_color', 'text', 'tweet_coord', 'tweet_count', 'tweet_created', 'tweet_id', 'tweet_location', 'user_timezone']


In [3]:
# Split dataset for test and train sets
df = df.dropna(subset=["fav_number", "tweet_count", "retweet_count", "gender", "name", "description"])
df = df[df.gender.isin(["male", "female"])]

df["name_len"] = df["name"].str.len()
df["desc_len"] = df["description"].str.len()
df = df.drop(["name", "description"], axis=1)

train_df, test_df = train_test_split(df)
train_x = train_df[["fav_number", "retweet_count", "tweet_count"]]
test_x = test_df[["fav_number", "retweet_count", "tweet_count"]]
train_y = train_df["gender"].astype("category")
test_y = test_df["gender"].astype("category")

print(f"Training data rows: {train_df.shape[0]}")
print(f"Test data rows: {test_df.shape[0]}")

Training data rows: 8395
Test data rows: 2799


In [4]:
# Train model
model = RandomForestClassifier()
model.fit(train_x, train_y)
prediction = model.predict(test_x)
print(prediction)
acc = accuracy_score(test_y, prediction)
print(f"Accuracy: {acc:.2f}")

['female' 'female' 'female' ... 'female' 'female' 'female']
Accuracy: 0.55
