In [None]:
import pandas as pd
import os
from subprocess import call
from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from dataset_reader import read_dataset

In [None]:
# Read the raw dataset through the project helper and report its dimensions.
df = read_dataset()
print(f"Dataset size: {len(df.index)} rows")
print(f"Columns: {[*df.columns]}")

In [None]:
# Clean the data, derive length features, and split into train / test sets.
#
# The cleaned frame is built as a NEW object (`model_df`) and `df` is left
# untouched, so this cell can be re-run any number of times. Rebinding `df`
# after dropping "name"/"description" would make a second run fail in
# `dropna(subset=...)` because those columns would no longer exist.
REQUIRED_COLUMNS = ["fav_number", "tweet_count", "retweet_count", "gender", "name", "description"]
SPLIT_SEED = 42  # fixed seed so the train/test partition is the same on every run

model_df = (
    df.dropna(subset=REQUIRED_COLUMNS)
    # Keep only the two target classes the classifier is trained on.
    .loc[lambda frame: frame["gender"].isin(["male", "female"])]
    # `assign` returns a new frame, avoiding SettingWithCopyWarning on the filtered slice.
    .assign(
        name_len=lambda frame: frame["name"].str.len(),
        desc_len=lambda frame: frame["description"].str.len(),
    )
    # The raw text columns are only needed to derive the length features above.
    .drop(columns=["name", "description"])
)

train_df, test_df = train_test_split(model_df, random_state=SPLIT_SEED)
feature_names = ["fav_number", "tweet_count", "retweet_count", "name_len", "desc_len"]
train_x = train_df[feature_names]
test_x = test_df[feature_names]
train_y = train_df["gender"].astype("category")
test_y = test_df["gender"].astype("category")

print(f"Training data rows: {train_df.shape[0]}")
print(f"Test data rows: {test_df.shape[0]}")

In [None]:
# Train a random forest on the training split and report accuracy on the test split.
MODEL_SEED = 42  # fixed seed so the fitted forest and the reported accuracy are reproducible

model = RandomForestClassifier(random_state=MODEL_SEED)
model.fit(train_x, train_y)
prediction = model.predict(test_x)
print(prediction)
acc = accuracy_score(test_y, prediction)
print(f"Accuracy: {acc:.2f}")

In [None]:
# Export one tree of the fitted forest to Graphviz and display it as a PNG.
TREE_INDEX = 5  # arbitrary tree chosen for illustration; must be < len(model.estimators_)
estimator = model.estimators_[TREE_INDEX]

tmp_dir = os.path.join(os.getcwd(), 'tmp')
# exist_ok makes the cell safe to re-run once the directory has been created.
os.makedirs(tmp_dir, exist_ok=True)
graph_file = os.path.join(tmp_dir, 'tree.dot')
png_file = os.path.join(tmp_dir, 'tree.png')

# export_graphviz expects class names in ascending class order, i.e. the order
# of model.classes_ (sorted labels). A hand-written ['male', 'female'] list
# would swap the labels shown in the rendered tree.
class_names = [str(label) for label in model.classes_]

export_graphviz(estimator, out_file=graph_file, feature_names=feature_names,
                class_names=class_names, rounded=True, proportion=False, precision=2, filled=True)

# Requires the Graphviz `dot` executable on PATH. Fail loudly if rendering
# fails instead of letting Image() complain about a missing or stale file.
exit_code = call(['dot', '-Tpng', graph_file, '-o', png_file, '-Gdpi=600'])
if exit_code != 0:
    raise RuntimeError(f"Graphviz 'dot' exited with status {exit_code}; could not render {png_file}")

Image(filename=png_file)