In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv("gender-classifier-DFE-791531.csv", encoding='latin1')

In [4]:
# Utility function taken from: https://github.com/rasto2211/Twitter-User-Gender-Classification/blob/master/notebooks/exploration.ipynb
# Normalizes text for analysis by removing URLs, special characters, and double spaces

def normalize_text(text):
    # Remove non-ASCII chars.
    text = re.sub('[^\x00-\x7F]+',' ', text)
    
    # Remove URLs
    text = re.sub('https?:\/\/.*[\r\n]*', ' ', text)
    
    # Remove special chars.
    text = re.sub('[?!+%{}:;.,"\'()\[\]_]', '',text)
    
    # Remove double spaces.
    text = re.sub('\s+',' ',text)
    return text

df['edited_text'] = [normalize_text(text) for text in df['text']]
#print(df['edited_text'])

In [12]:
# Choose data only where gender is either male or female and gender classification confidence is about 1
chosen_rows = df[df["gender"].isin(["male", "female"]) & (df["gender:confidence"] > 0.99)].index.tolist()

# Shuffle data to ensure randomness
random.shuffle(chosen_rows)

# Data Guidelines according to Canvas
n = len(chosen_rows)
train_data_size = .6
test_data_size = .2
validation_data_size = .2

# Partition chosen_rows
train_data_nrows = round(train_data_size * n)
train_data = chosen_rows[:train_data_nrows]
validation_data_upper_limit = (train_data_nrows + round(validation_data_size * n))
validation_data = chosen_rows[train_data_nrows : validation_data_upper_limit]
test_data = chosen_rows[validation_data_upper_limit:]


train_data_nrows

6012

In [6]:
# Our own MNB implementation
train_data_1 = df.ix[train_data, :]["edited_text"]
validation_data_1 = df.ix[validation_data, :]["edited_text"]
vectorizer_1 = CountVectorizer().fit_transform(train_data_1)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [7]:
# Making classifier
vectorizer = CountVectorizer()
#train_counts = vectorizer.fit_transform(df.ix[train_data, :]["edited_text"])
vectorizer = vectorizer.fit(df.ix[train_data, :]["edited_text"])
x_train = vectorizer.transform(df.ix[train_data, "edited_text"])
encoder = LabelEncoder()
y_train = encoder.fit_transform(df.loc[train_data, "gender"])
#print(x_train)
#print(y_train)

In [8]:
nb = MultinomialNB()
nb = nb.fit(x_train, y_train)
x_val = vectorizer.transform(df.ix[validation_data, "edited_text"])
y_val = encoder.transform(df.ix[validation_data, "gender"])
print(classification_report(y_val, nb.predict(x_val), target_names=encoder.classes_))
print(f"accuracy score: {accuracy_score(y_val, nb.predict(x_val))}")

             precision    recall  f1-score   support

     female       0.58      0.76      0.66      1059
       male       0.59      0.39      0.47       945

avg / total       0.58      0.58      0.57      2004

accuracy score: 0.5823353293413174


In [9]:
print(chosen_rows)

[19707, 9664, 19378, 11820, 2891, 9214, 12765, 17098, 7800, 6191, 4338, 1786, 15145, 16529, 5239, 3229, 9908, 12532, 11839, 15370, 16901, 7363, 3982, 2587, 16503, 403, 7030, 15768, 14378, 19138, 234, 4770, 9831, 697, 3412, 15080, 2210, 12743, 15703, 19093, 9092, 5447, 10289, 9079, 2922, 15050, 1264, 17139, 1700, 11542, 2546, 1878, 17068, 19642, 14410, 4952, 18963, 15852, 19471, 7734, 16626, 9234, 17714, 5495, 6552, 5873, 4174, 16221, 19042, 18162, 558, 17736, 6185, 17438, 14407, 9747, 11537, 11346, 7737, 213, 14834, 729, 2707, 18036, 13861, 11746, 11205, 13256, 6154, 1447, 10573, 1953, 7441, 3714, 18604, 16882, 13727, 19715, 747, 6390, 724, 3876, 1823, 13820, 14963, 3270, 8296, 4439, 3958, 13371, 13943, 17453, 13067, 1237, 13617, 3727, 7001, 12120, 9494, 14326, 1222, 3047, 2288, 13391, 1422, 3253, 112, 14421, 19784, 1077, 5982, 18381, 6948, 9849, 9338, 6916, 17224, 377, 19153, 3598, 12585, 15422, 14885, 11723, 13726, 5809, 2437, 11794, 14721, 8660, 15525, 14035, 3520, 13960, 7473, 1133