### Spacy Load

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
!pip install turicreate

### Imports

In [None]:
import re
import matplotlib.pyplot as plt
import spacy
import turicreate as tc
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import operator

tqdm.pandas()
%matplotlib inline

# Fxp Forum Data
[Leet](https://en.wikipedia.org/wiki/Leet) Setup

In [None]:
# Parallel alphabets: keyspace[i] is the l33t stand-in for origspace[i]
# (e.g. '@' <-> 'a', '3' <-> 'e', '$' <-> 's'); letters without a stand-in map to themselves.
origspace = "abcdefghijklmnopqrstuvwxyz"
keyspace = "@6(d3f9h1jklmn0pqr$7uvwxyz"

In [None]:
def encode(string):
    """Map l33t characters in ``string`` back to plain letters.

    Every character that occurs in ``keyspace`` is replaced by the character
    at the same position in ``origspace``; any other character is kept as is.
    The text is split and re-joined on single spaces, so spacing is preserved.
    """
    def _plain(ch):
        # Position of the character in the l33t alphabet, or -1 when it is not l33t
        pos = keyspace.find(ch)
        return origspace[pos] if pos != -1 else ch

    return " ".join("".join(_plain(ch) for ch in word) for word in string.split(" "))

In [None]:
# spaCy pipeline used below to turn user names into word vectors
nlp = spacy.load('en_core_web_lg')

# NOTE(review): this path is under /content/drive, but the only drive.mount call
# in the notebook appears much later (Ensemble section) — confirm the drive is mounted first.
path_to_train = '/content/drive/MyDrive/Data Mining/Data sets/fxp_user.txt'

# Tab-separated file read without a header row; the auto-named columns X1..X3
# are renamed to user_id, user_name and gender.
fxp_users = tc.SFrame.read_csv(path_to_train, delimiter='\t', header=False)
fxp_users = fxp_users.rename({'X1': 'user_id', 'X2': 'user_name', 'X3': 'gender'})

## Vector Method
Extract different features from the username and build word vectors using spacy

In [None]:
# Build the vector-method feature table; every key of `l` becomes one SFrame column.
l = {'user_name': [], 'leet': [], 'gender': [], 'special_chars': [], 'is_number': [], 'vector': [], 'leet_vector': []}
skipped = 0
# Iterating the SFrame yields one dict per row (much cheaper than indexing row by row)
for row in tqdm(fxp_users, total=len(fxp_users)):
    raw_name = row["user_name"]
    if not isinstance(raw_name, str):
        # Missing / non-text names cannot be featurised. Skip them explicitly
        # instead of hiding every possible error behind a bare `except`.
        skipped += 1
        continue
    lowered = raw_name.lower()
    # Get the l33t format
    leet = encode(lowered)
    # Digits have to be detected BEFORE the name is cleaned: the cleaned name
    # contains only a-z and spaces, so checking it would always give False.
    is_number = any(c.isdigit() for c in lowered)
    # Special characters = anything other than letters, digits and spaces.
    # The check is case-insensitive, so capital letters alone do not count.
    special_char = re.search(r'[^a-z0-9 ]', lowered) is not None
    # Remove all non-letter characters from the user name
    user_name = re.sub(r'[^a-z ]', '', lowered)
    # Word vectors of the cleaned name and of its l33t form. Compute both before
    # appending anything so a failure can never leave the lists with unequal lengths.
    vector = nlp(user_name).vector
    leet_vector = nlp(leet).vector
    l['leet'].append(leet)
    l['special_chars'].append(special_char)
    l['is_number'].append(is_number)
    l['user_name'].append(user_name)
    l['gender'].append(row["gender"])
    l['vector'].append(vector)
    l['leet_vector'].append(leet_vector)
if skipped:
    print(f"skipped {skipped} rows without a textual user name")

In [None]:
# Replace the raw table with the engineered feature columns collected in `l`
fxp_users = tc.SFrame(l)
fxp_users.materialize()
# Keep every female row; each split below combines them with a sample of the male rows
female = fxp_users[fxp_users['gender'] == 'f']

### Boosted trees and Random Forest

In [None]:
d_boosted = {}
d_random = {}
debug = False
# Repeat the whole experiment on 10 independent random splits
for i in tqdm(range(10)):
    # The set is dominated by 'm' rows, so keep only a ~12% random sample of
    # them and combine it with all female rows to balance the classes
    male, _ = fxp_users[fxp_users['gender'] == 'm'].random_split(0.12)
    train, test = male.append(female).shuffle().random_split(0.8)
    boosted_trees_models = {}
    random_forest_models = {}
    # Fit both ensembles at depths 10, 20 and 30; store (model, evaluation) per depth
    for depth in (10, 20, 30):
        for make_model, results in ((tc.boosted_trees_classifier.create, boosted_trees_models),
                                    (tc.random_forest_classifier.create, random_forest_models)):
            model = make_model(train, target='gender',
                               max_iterations=10, max_depth=depth, verbose=False)
            results[depth] = (model, model.evaluate(test))
    # Per classifier, keep the depth with the highest accuracy on this split
    # (the shallowest depth wins a tie)
    best_boosted_depth = max(boosted_trees_models,
                             key=lambda d: boosted_trees_models[d][1]['accuracy'])
    best_random_depth = max(random_forest_models,
                            key=lambda d: random_forest_models[d][1]['accuracy'])
    best_boosted_model, best_boosted_res = boosted_trees_models[best_boosted_depth]
    best_random_model, best_random_res = random_forest_models[best_random_depth]
    d_boosted[i] = {'model': best_boosted_model, 'depth': best_boosted_depth,
                    'accuracy': best_boosted_res.get('accuracy')}
    d_random[i] = {'model': best_random_model, 'depth': best_random_depth,
                   'accuracy': best_random_res.get('accuracy')}
    if debug:
        print(f"best Boosted Trees Classifier has a depth of {best_boosted_depth} with accuracy of"
              f" {best_boosted_res.get('accuracy')}")
        print(f"best Random Forest Classifier has a depth of {best_random_depth} with accuracy of"
              f" {best_random_res.get('accuracy')}")

Plot the classifier performance on each iteration

In [None]:
# Best accuracy each classifier reached on every split, in split order
pres_boost = [run['accuracy'] for run in d_boosted.values()]
pres_random = [run['accuracy'] for run in d_random.values()]
# Number the splits from the results themselves instead of assuming exactly 10 runs
x = list(range(1, len(pres_boost) + 1))
plt.plot(x, pres_boost, 'go-')
plt.plot(x, pres_random, 'bo-')
plt.xlabel('Iteration')
plt.ylabel('Highest accuracy')
plt.title("Classifier Comparison")
# The green series is the boosted-trees classifier (the legend used to read "Boosted Forst")
plt.legend(['Boosted Trees', 'Random Forest'])
# NOTE(review): every section of this notebook saves its plot under this same
# file name, so each later plot overwrites the earlier one.
plt.savefig('Classifier Comparison')
plt.show()

### SVM

In [None]:
# `train` / `test` are whatever the last iteration of the loop above left behind,
# so the SVM is fit and scored on that single split only.
svm_model = tc.svm_classifier.create(train, target='gender')
svm_res = svm_model.evaluate(test)
print(f"best svm model got a accuracy of {svm_res.get('accuracy')}")

### Choose the best classifier

In [None]:
# Best run of each tree ensemble across the splits (first one wins a tie)
boosted_pick = d_boosted[pres_boost.index(max(pres_boost))]
random_pick = d_random[pres_random.index(max(pres_random))]
best_boosted_model = boosted_pick.get('model')
best_random_model = random_pick.get('model')
# Candidates and their accuracies, kept in the same order as `names`
names = ['boosted', 'random', 'svm']
models = [best_boosted_model, best_random_model, svm_model]
pres = [boosted_pick.get('accuracy'),
        random_pick.get('accuracy'),
        svm_res.get('accuracy')]
best_model_index = pres.index(max(pres))
best_model = models[best_model_index]
print(f"the best model is: {names[best_model_index]} with accuracy of: {pres[best_model_index]}")
# Persist the winner so it can be zipped to Drive in the next cell
best_model.save('best_model_vector_fxp')

Save it for later

In [None]:
!zip -r "/content/drive/MyDrive/Data Mining/Models/best_model_vector_fxp.zip" best_model_vector_fxp/

# Ngrams

In [None]:
# Character n-gram counts of the user name for n = 1..9, one column per n
features = [f'words_{i}grams' for i in range(1, 10)]
for n, column in enumerate(features, start=1):
    fxp_users[column] = tc.text_analytics.count_ngrams(fxp_users['user_name'], n=n, method='character')
fxp_users.materialize()
fxp_users

In [None]:
# Re-select the female rows so they include the n-gram columns added above,
# and reset the per-split result containers for the n-gram experiment.
female = fxp_users[fxp_users['gender'] == 'f']
d_boosted = {}
d_random = {}
debug = False

### Boosted trees and Random Forest

In [None]:
# Repeat the n-gram experiment on 10 independent random splits
for i in tqdm(range(10)):
    # Keep only a ~12% random sample of the male rows and combine it with all female rows
    male, _ = fxp_users[fxp_users['gender'] == 'm'].random_split(0.12)
    train, test = male.append(female).shuffle().random_split(0.8)
    boosted_trees_models = {}
    random_forest_models = {}
    # Fit both ensembles on the n-gram columns at depths 10, 20 and 30
    for depth in (10, 20, 30):
        for make_model, results in ((tc.boosted_trees_classifier.create, boosted_trees_models),
                                    (tc.random_forest_classifier.create, random_forest_models)):
            model = make_model(train, target='gender',
                               features=features,
                               max_iterations=10, max_depth=depth, verbose=False)
            results[depth] = (model, model.evaluate(test))
    # Per classifier, keep the depth with the highest accuracy on this split
    # (the shallowest depth wins a tie)
    best_boosted_depth = max(boosted_trees_models,
                             key=lambda d: boosted_trees_models[d][1]['accuracy'])
    best_random_depth = max(random_forest_models,
                            key=lambda d: random_forest_models[d][1]['accuracy'])
    best_boosted_model, best_boosted_res = boosted_trees_models[best_boosted_depth]
    best_random_model, best_random_res = random_forest_models[best_random_depth]
    d_boosted[i] = {'model': best_boosted_model, 'depth': best_boosted_depth,
                    'accuracy': best_boosted_res.get('accuracy')}
    d_random[i] = {'model': best_random_model, 'depth': best_random_depth,
                   'accuracy': best_random_res.get('accuracy')}
    if debug:
        print(f"best Boosted Trees Classifier has a depth of {best_boosted_depth} with accuracy of"
              f" {best_boosted_res.get('accuracy')}")
        print(f"best Random Forest Classifier has a depth of {best_random_depth} with accuracy of"
              f" {best_random_res.get('accuracy')}")

In [None]:
# Best accuracy each classifier reached on every split, in split order
pres_boost = [run['accuracy'] for run in d_boosted.values()]
pres_random = [run['accuracy'] for run in d_random.values()]
# Number the splits from the results themselves instead of assuming exactly 10 runs
x = list(range(1, len(pres_boost) + 1))
plt.plot(x, pres_boost, 'go-')
plt.plot(x, pres_random, 'bo-')
plt.xlabel('Iteration')
plt.ylabel('Highest accuracy')
plt.title("Classifier Comparison")
# The green series is the boosted-trees classifier (the legend used to read "Boosted Forst")
plt.legend(['Boosted Trees', 'Random Forest'])
# NOTE(review): every section of this notebook saves its plot under this same
# file name, so each later plot overwrites the earlier one.
plt.savefig('Classifier Comparison')
plt.show()

### SVM

In [None]:
# `train` / `test` come from the last iteration of the loop above, so the SVM
# is fit and scored on that single split, using the n-gram columns only.
svm_model = tc.svm_classifier.create(train, target='gender', features=features)
svm_res = svm_model.evaluate(test)
print(f"best svm model got a accuracy of {svm_res.get('accuracy')}")

### Choose the best classifier

In [None]:
# Best run of each tree ensemble across the splits (first one wins a tie)
boosted_pick = d_boosted[pres_boost.index(max(pres_boost))]
random_pick = d_random[pres_random.index(max(pres_random))]
best_boosted_model = boosted_pick.get('model')
best_random_model = random_pick.get('model')
# Candidates and their accuracies, kept in the same order as `names`
names = ['boosted', 'random', 'svm']
models = [best_boosted_model, best_random_model, svm_model]
pres = [boosted_pick.get('accuracy'),
        random_pick.get('accuracy'),
        svm_res.get('accuracy')]
best_model_index = pres.index(max(pres))
best_model = models[best_model_index]
print(f"the best model is: {names[best_model_index]} with accuracy of: {pres[best_model_index]}")
# Persist the winner so it can be zipped to Drive in the next cell
best_model.save('best_model_ngram_fxp')

Save for later use

In [None]:
!zip -r "/content/drive/MyDrive/Data Mining/Models/best_model_ngram_fxp.zip" best_model_ngram_fxp/

# Twitter Users

In [None]:
# Load the Twitter data set, keeping only the gender label and the display name
path_to_train = '/content/drive/MyDrive/Data Mining/Data sets/twitter_users.csv'
data = (
    pd.read_csv(path_to_train, encoding="latin1")[['gender', 'name']]
    .dropna(axis=0)  # discard rows where either field is missing
)
# Keep only the 'male' / 'female' rows, then rename `name` to `user_name`
dataf = data[data['gender'].isin(['male', 'female'])]
dataf.columns = ['gender', 'user_name']

twitter_users = tc.SFrame(dataf)
twitter_users

## Vector Method

In [None]:
# Build the vector-method feature table; every key of `l` becomes one SFrame column.
l = {'user_name': [], 'leet': [], 'gender': [], 'special_chars': [], 'is_number': [], 'vector': [], 'leet_vector': []}
skipped = 0
# Iterating the SFrame yields one dict per row (much cheaper than indexing row by row)
for row in tqdm(twitter_users, total=len(twitter_users)):
    # The name column was renamed to 'user_name' when the frame was built; reading
    # row["name"] raised KeyError on every row, which the former bare `except`
    # swallowed, leaving the feature table empty.
    raw_name = row["user_name"]
    if not isinstance(raw_name, str):
        # Missing / non-text names cannot be featurised; skip them explicitly
        skipped += 1
        continue
    lowered = raw_name.lower()
    # Get the l33t format
    leet = encode(lowered)
    # Digits have to be detected BEFORE the name is cleaned: the cleaned name
    # contains only a-z and spaces, so checking it would always give False.
    is_number = any(c.isdigit() for c in lowered)
    # Special characters = anything other than letters, digits and spaces.
    # The check is case-insensitive, so capital letters alone do not count.
    special_char = re.search(r'[^a-z0-9 ]', lowered) is not None
    # Remove all non-letter characters from the user name
    user_name = re.sub(r'[^a-z ]', '', lowered)
    # Word vectors of the cleaned name and of its l33t form. Compute both before
    # appending anything so a failure can never leave the lists with unequal lengths.
    vector = nlp(user_name).vector
    leet_vector = nlp(leet).vector
    l['leet'].append(leet)
    l['special_chars'].append(special_char)
    l['is_number'].append(is_number)
    l['user_name'].append(user_name)
    l['gender'].append(row["gender"])
    l['vector'].append(vector)
    l['leet_vector'].append(leet_vector)
if skipped:
    print(f"skipped {skipped} rows without a textual user name")

In [None]:
# Replace the raw table with the engineered feature columns collected in `l`
twitter_users = tc.SFrame(l)
twitter_users.materialize()

### Boosted trees and Random Forest

In [None]:
d_boosted = {}
d_random = {}
debug = False
# Repeat the whole experiment on 10 independent random 80/20 splits
for i in tqdm(range(10)):
    train, test = twitter_users.random_split(0.8)
    boosted_trees_models = {}
    random_forest_models = {}
    # Fit both ensembles at depths 10, 20 and 30; store (model, evaluation) per depth
    for depth in (10, 20, 30):
        for make_model, results in ((tc.boosted_trees_classifier.create, boosted_trees_models),
                                    (tc.random_forest_classifier.create, random_forest_models)):
            model = make_model(train, target='gender',
                               max_iterations=10, max_depth=depth, verbose=False)
            results[depth] = (model, model.evaluate(test))
    # Per classifier, keep the depth with the highest accuracy on this split
    # (the shallowest depth wins a tie)
    best_boosted_depth = max(boosted_trees_models,
                             key=lambda d: boosted_trees_models[d][1]['accuracy'])
    best_random_depth = max(random_forest_models,
                            key=lambda d: random_forest_models[d][1]['accuracy'])
    best_boosted_model, best_boosted_res = boosted_trees_models[best_boosted_depth]
    best_random_model, best_random_res = random_forest_models[best_random_depth]
    d_boosted[i] = {'model': best_boosted_model, 'depth': best_boosted_depth,
                    'accuracy': best_boosted_res.get('accuracy')}
    d_random[i] = {'model': best_random_model, 'depth': best_random_depth,
                   'accuracy': best_random_res.get('accuracy')}
    if debug:
        print(f"best Boosted Trees Classifier has a depth of {best_boosted_depth} with accuracy of"
              f" {best_boosted_res.get('accuracy')}")
        print(f"best Random Forest Classifier has a depth of {best_random_depth} with accuracy of"
              f" {best_random_res.get('accuracy')}")

In [None]:
# Best accuracy each classifier reached on every split, in split order
pres_boost = [run['accuracy'] for run in d_boosted.values()]
pres_random = [run['accuracy'] for run in d_random.values()]
# Number the splits from the results themselves instead of assuming exactly 10 runs
x = list(range(1, len(pres_boost) + 1))
plt.plot(x, pres_boost, 'go-')
plt.plot(x, pres_random, 'bo-')
plt.xlabel('Iteration')
plt.ylabel('Highest accuracy')
plt.title("Classifier Comparison")
# The green series is the boosted-trees classifier (the legend used to read "Boosted Forst")
plt.legend(['Boosted Trees', 'Random Forest'])
# NOTE(review): every section of this notebook saves its plot under this same
# file name, so each later plot overwrites the earlier one.
plt.savefig('Classifier Comparison')
plt.show()

### SVM

In [None]:
# `train` / `test` are whatever the last iteration of the loop above left behind,
# so the SVM is fit and scored on that single split only.
svm_model = tc.svm_classifier.create(train, target='gender')
svm_res = svm_model.evaluate(test)
print(f"best svm model got a accuracy of {svm_res.get('accuracy')}")

### Choose the best classifier

In [None]:
# Best run of each tree ensemble across the splits (first one wins a tie)
boosted_pick = d_boosted[pres_boost.index(max(pres_boost))]
random_pick = d_random[pres_random.index(max(pres_random))]
best_boosted_model = boosted_pick.get('model')
best_random_model = random_pick.get('model')
# Candidates and their accuracies, kept in the same order as `names`
names = ['boosted', 'random', 'svm']
models = [best_boosted_model, best_random_model, svm_model]
pres = [boosted_pick.get('accuracy'),
        random_pick.get('accuracy'),
        svm_res.get('accuracy')]
best_model_index = pres.index(max(pres))
best_model = models[best_model_index]
print(f"the best model is: {names[best_model_index]} with accuracy of: {pres[best_model_index]}")
# Persist the winner so it can be zipped to Drive in the next cell
best_model.save('best_model_vector_twitter')

In [None]:
!zip -r "/content/drive/MyDrive/Data Mining/Models/best_model_vector_twitter.zip" best_model_vector_twitter/

## NGram

In [None]:
# Character n-gram counts of the user name for n = 1..9, one column per n
features = [f'words_{i}grams' for i in range(1, 10)]
for n, column in enumerate(features, start=1):
    twitter_users[column] = tc.text_analytics.count_ngrams(twitter_users['user_name'], n=n, method='character')
twitter_users.materialize()

In [None]:
# Reset the per-split result containers for the Twitter n-gram experiment
d_boosted = {}
d_random = {}
debug = False

### Boosted trees and Random Forest

In [None]:
# Repeat the n-gram experiment on 10 independent random 80/20 splits
for i in tqdm(range(10)):
    train, test = twitter_users.random_split(0.8)
    boosted_trees_models = {}
    random_forest_models = {}
    # Fit both ensembles on the n-gram columns at depths 10, 20 and 30
    for depth in (10, 20, 30):
        for make_model, results in ((tc.boosted_trees_classifier.create, boosted_trees_models),
                                    (tc.random_forest_classifier.create, random_forest_models)):
            model = make_model(train, target='gender',
                               features=features,
                               max_iterations=10, max_depth=depth, verbose=False)
            results[depth] = (model, model.evaluate(test))
    # Per classifier, keep the depth with the highest accuracy on this split
    # (the shallowest depth wins a tie)
    best_boosted_depth = max(boosted_trees_models,
                             key=lambda d: boosted_trees_models[d][1]['accuracy'])
    best_random_depth = max(random_forest_models,
                            key=lambda d: random_forest_models[d][1]['accuracy'])
    best_boosted_model, best_boosted_res = boosted_trees_models[best_boosted_depth]
    best_random_model, best_random_res = random_forest_models[best_random_depth]
    d_boosted[i] = {'model': best_boosted_model, 'depth': best_boosted_depth,
                    'accuracy': best_boosted_res.get('accuracy')}
    d_random[i] = {'model': best_random_model, 'depth': best_random_depth,
                   'accuracy': best_random_res.get('accuracy')}
    if debug:
        print(f"best Boosted Trees Classifier has a depth of {best_boosted_depth} with accuracy of"
              f" {best_boosted_res.get('accuracy')}")
        print(f"best Random Forest Classifier has a depth of {best_random_depth} with accuracy of"
              f" {best_random_res.get('accuracy')}")

In [None]:
# Best accuracy each classifier reached on every split, in split order
pres_boost = [run['accuracy'] for run in d_boosted.values()]
pres_random = [run['accuracy'] for run in d_random.values()]
# Number the splits from the results themselves instead of assuming exactly 10 runs
x = list(range(1, len(pres_boost) + 1))
plt.plot(x, pres_boost, 'go-')
plt.plot(x, pres_random, 'bo-')
plt.xlabel('Iteration')
plt.ylabel('Highest accuracy')
plt.title("Classifier Comparison")
# The green series is the boosted-trees classifier (the legend used to read "Boosted Forst")
plt.legend(['Boosted Trees', 'Random Forest'])
# NOTE(review): every section of this notebook saves its plot under this same
# file name, so each later plot overwrites the earlier one.
plt.savefig('Classifier Comparison')
plt.show()

### SVM

In [None]:
# `train` / `test` come from the last iteration of the loop above, so the SVM
# is fit and scored on that single split, using the n-gram columns only.
svm_model = tc.svm_classifier.create(train, target='gender', features=features)
svm_res = svm_model.evaluate(test)
print(f"best svm model got a accuracy of {svm_res.get('accuracy')}")

### Nearest Neighbors (NN)

In [None]:
# Nearest-neighbour classifier on the same n-gram columns, fit and scored on the
# split left behind by the last iteration of the training loop above.
nn_model = tc.nearest_neighbor_classifier.create(train, target='gender', features=features, distance='auto')
nn_res = nn_model.evaluate(test)
print(f"best nn model got a accuracy of {nn_res.get('accuracy')}")

### Choose the best classifier

In [None]:
# Best run of each tree ensemble across the splits (first one wins a tie)
boosted_pick = d_boosted[pres_boost.index(max(pres_boost))]
random_pick = d_random[pres_random.index(max(pres_random))]
best_boosted_model = boosted_pick.get('model')
best_random_model = random_pick.get('model')
# Candidates and their accuracies, kept in the same order as `names`
names = ['boosted', 'random', 'svm', 'nn']
models = [best_boosted_model, best_random_model, svm_model, nn_model]
pres = [boosted_pick.get('accuracy'),
        random_pick.get('accuracy'),
        svm_res.get('accuracy'), nn_res.get('accuracy')]
best_model_index = pres.index(max(pres))
best_model = models[best_model_index]
print(f"the best model is: {names[best_model_index]} with accuracy of: {pres[best_model_index]}")
# Persist the winner so it can be zipped to Drive in the next cell
best_model.save('best_model_ngram_twitter')

In [None]:
!zip -r "/content/drive/MyDrive/Data Mining/Models/best_model_ngram_twitter.zip" best_model_ngram_twitter/

# Labeled users using names dict
Ngram

In [None]:
from turicreate import aggregate as agg
# Users labelled through the names dictionary; the two CSV columns are renamed
# to user_name and gender (this assignment assumes the file has exactly two columns).
data = pd.read_csv('/content/drive/MyDrive/Data Mining/Data sets/training_data.csv', encoding="latin1")
data.columns = ['user_name', 'gender']
dict_users = tc.SFrame(data)
# Row count per gender, displayed to check the class balance
dict_users.groupby(key_column_names='gender',operations={'count': agg.COUNT()}) #we are ok here

### Features Extraction

In [None]:
# Character n-gram counts of the user name for n = 1..9, one column per n
features = [f'words_{i}grams' for i in range(1, 10)]
for n, column in enumerate(features, start=1):
    dict_users[column] = tc.text_analytics.count_ngrams(dict_users['user_name'], n=n, method='character')

### Boosted trees

In [None]:
# Single 80/20 split; the same train/test pair is reused by the random-forest
# and SVM cells that follow.
train, test = dict_users.random_split(0.8)
boosted_model = tc.boosted_trees_classifier.create(train,features=features, target="gender")
boosted_res = boosted_model.evaluate(test)
print(f"best boosted model got a accuracy of {boosted_res.get('accuracy')}")

### Random Forest

In [None]:
# Random forest on the same split and n-gram columns as the boosted model
random_model = tc.random_forest_classifier.create(train,features=features, target="gender")
# Evaluate the random forest itself: this previously called boosted_model.evaluate,
# so the boosted-trees accuracy was reported (and later compared) as the forest's.
random_res = random_model.evaluate(test)
print(f"best random model got a accuracy of {random_res.get('accuracy')}")

### SVM

In [None]:
# SVM on the same split and n-gram columns as the two tree ensembles
svm_model = tc.svm_classifier.create(train, target='gender', features=features)
svm_res = svm_model.evaluate(test)
print(f"best svm model got a accuracy of {svm_res.get('accuracy')}")

### Choose the best classifier

In [None]:
# Candidates and their held-out accuracies, kept in the same order as `names`
names = ['boosted', 'random', 'svm']
models = [boosted_model, random_model, svm_model]
pres = [evaluation.get('accuracy') for evaluation in (boosted_res, random_res, svm_res)]
# First candidate with the highest accuracy wins
best_model_index = pres.index(max(pres))
best_model = models[best_model_index]
print(f"the best model is: {names[best_model_index]} with accuracy of: {pres[best_model_index]}")
# Persist the winner so it can be zipped to Drive in the next cell
best_model.save('best_model_ngram_train')

In [None]:
!zip -r "/content/drive/MyDrive/Data Mining/Models/best_model_ngram_train.zip" best_model_ngram_train/

# Ensemble
Get All 3 Best Models

In [None]:
# Mount Google Drive so the /content/drive/... paths resolve.
# NOTE(review): earlier cells already read from and write to /content/drive, so on
# a fresh runtime this mount presumably has to run before them — confirm cell order.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp "/content/drive/MyDrive/Data Mining/Models/best_model_ngram_train.zip" best_model_ngram_train.zip
!cp '/content/drive/MyDrive/Data Mining/Models/best_model_ngram_fxp.zip' best_model_ngram_fxp.zip
!cp '/content/drive/MyDrive/Data Mining/Models/best_model_ngram_twitter.zip' best_model_ngram_twitter.zip
!unzip -q best_model_ngram_train.zip
!unzip -q best_model_ngram_fxp.zip
!unzip -q best_model_ngram_twitter.zip

### Load The models

In [None]:
# Load the three saved n-gram models from the directories unzipped in the previous cell
twitter_model = tc.load_model('best_model_ngram_twitter')
fxp_model = tc.load_model('best_model_ngram_fxp/')
train_model = tc.load_model('best_model_ngram_train/')

### Load Data and extract features

In [None]:
full_data = pd.read_csv('/content/drive/MyDrive/Data Mining/email_list_modified.csv')
# NOTE(review): later cells read fxp_predict / twitter_predict / train_predict from
# training_data, so this CSV is presumably expected to already contain those
# prediction columns — confirm against the file.
training_data = pd.read_csv('/content/drive/MyDrive/Data Mining/Data sets/training_data.csv', encoding='ISO-8859-1')
# Same character n-gram columns (n = 1..9) the models were trained on,
# computed from the stringified usernames (the SFrame's single column is 'X1')
features = [f'words_{i}grams' for i in range(1, 10)]
temp_st = tc.SFrame(full_data['username'].map(str))
for n, column in enumerate(features, start=1):
    temp_st[column] = tc.text_analytics.count_ngrams(temp_st['X1'], n=n, method='character')

### Predict 

In [None]:
# One prediction column per model, all computed from the same n-gram features
full_data['fxp_predict'] = fxp_model.predict(temp_st[features])
full_data['twitter_predict'] = twitter_model.predict(temp_st[features])
full_data['train_predict'] = train_model.predict(temp_st[features])

### make all outputs the same

In [None]:
def _gender_or_unknown(g):
    """Map a label to 'male' / 'female' by its first letter, anything else to 'unknown'."""
    first = g[0].lower()
    if first == 'm':
        return 'male'
    if first == 'f':
        return 'female'
    return 'unknown'


def _male_or_female(g):
    """Map a model prediction to 'male' when it starts with m/M, otherwise 'female'."""
    return 'male' if g[0].lower() == 'm' else 'female'


prediction_columns = ('fxp_predict', 'twitter_predict', 'train_predict')

# Unlabelled data: normalise the dictionary label, then each model's prediction
full_data['gender'] = full_data['gender_by_dict'].apply(_gender_or_unknown)
for column in prediction_columns:
    full_data[column] = full_data[column].apply(_male_or_female)

# Labelled data: same normalisation
# NOTE(review): assumes training_data already has the three prediction columns — confirm
training_data['gender'] = training_data['gender'].apply(_gender_or_unknown)
for column in prediction_columns:
    training_data[column] = training_data[column].apply(_male_or_female)

Ensemble Majority Vote

In [None]:
def ensemble_label(row):
    """Return the majority vote of the three model predictions in ``row``.

    ``row`` must provide 'fxp_predict', 'twitter_predict' and 'train_predict',
    each equal to 'male' or 'female'; any other value raises ``KeyError``.
    """
    votes = {'male': 0, 'female': 0}
    for source in ('fxp_predict', 'twitter_predict', 'train_predict'):
        votes[row[source]] += 1
    # With three votes over two labels there is always a strict majority
    return max(votes, key=votes.get)

In [None]:
# Row-wise majority vote over the three normalised prediction columns
full_data['ensemble_label'] = full_data.apply(ensemble_label, axis=1)
training_data['ensemble_label'] = training_data.apply(ensemble_label, axis=1)

# Test the ensemble learning
## Test on each classifier

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Row/column order used for every confusion matrix below
labels = ['male', 'female']

### FXP classifier Confusion matrix

In [None]:
# Compare the FXP-trained model's predictions with the labels in training_data
y_true = list(training_data['gender'])
y_pred = list(training_data['fxp_predict'])
# `labels` must be passed by keyword: it is keyword-only in current scikit-learn
cm = confusion_matrix(y_true, y_pred, labels=labels)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick to each matrix row/column before labelling it, instead of
# relying on an implicit extra tick in front of the first cell
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
print(f'The accuracy of the classifier train of FXP is: {accuracy_score(y_true, y_pred)}')

### Twitter classifier Confusion matrix

In [None]:
# Compare the Twitter-trained model's predictions with the labels in training_data
y_true = list(training_data['gender'])
y_pred = list(training_data['twitter_predict'])
# `labels` must be passed by keyword: it is keyword-only in current scikit-learn
cm = confusion_matrix(y_true, y_pred, labels=labels)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick to each matrix row/column before labelling it, instead of
# relying on an implicit extra tick in front of the first cell
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
print(f'The accuracy of the classifier train of Twitter is: {accuracy_score(y_true, y_pred)}')

### ensemble classifier Confusion matrix

In [None]:
# Compare the majority-vote label with the labels in training_data
y_true = list(training_data['gender'])
y_pred = list(training_data['ensemble_label'])
# `labels` must be passed by keyword: it is keyword-only in current scikit-learn
cm = confusion_matrix(y_true, y_pred, labels=labels)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick to each matrix row/column before labelling it, instead of
# relying on an implicit extra tick in front of the first cell
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
print(f'The accuracy of the classifier train of ensemble is: {accuracy_score(y_true, y_pred)}')

# Okcupid user Name Set
### Load the data

In [None]:
okcupid_usernames = pd.read_csv('/content/drive/MyDrive/Data Mining/Data sets/okcupid_usernames.csv')
okcupid_usernames.columns = ['name', 'gender']
# Only the exact value ' guy' (with its leading space) counts as male; everything else becomes female
okcupid_usernames['gender'] = okcupid_usernames['gender'].map(
    lambda gender: 'male' if gender == ' guy' else 'female')
# Drop hyphens from the user names
okcupid_usernames['name'] = okcupid_usernames['name'].map(
    lambda name: name.replace('-', ''))

### Extract Features

In [None]:
# Same character n-gram columns (n = 1..9) the models were trained on,
# computed from the stringified names (the SFrame's single column is 'X1')
features = [f'words_{i}grams' for i in range(1, 10)]
temp_st = tc.SFrame(okcupid_usernames['name'].map(str))
for n, column in enumerate(features, start=1):
    temp_st[column] = tc.text_analytics.count_ngrams(temp_st['X1'], n=n, method='character')

### Predict with each model

In [None]:
# One prediction column per model, all computed from the same n-gram features
okcupid_usernames['fxp_predict'] = fxp_model.predict(temp_st[features])
okcupid_usernames['twitter_predict'] = twitter_model.predict(temp_st[features])
okcupid_usernames['train_predict'] = train_model.predict(temp_st[features])

### Make the labels the same

In [None]:
# Collapse each model's raw prediction to 'male' / 'female' by its first letter
for column in ('fxp_predict', 'twitter_predict', 'train_predict'):
    okcupid_usernames[column] = okcupid_usernames[column].apply(
        lambda g: 'male' if g[0].lower() == 'm' else 'female')

### Ensemble

In [None]:
# Row-wise majority vote over the three normalised prediction columns
okcupid_usernames['ensemble_label'] = okcupid_usernames.apply(ensemble_label, axis=1)

### FXP classifier Confusion matrix

In [None]:
# Compare the FXP-trained model's predictions with the OkCupid gender labels
y_true = list(okcupid_usernames['gender'])
y_pred = list(okcupid_usernames['fxp_predict'])
# `labels` must be passed by keyword: it is keyword-only in current scikit-learn
cm = confusion_matrix(y_true, y_pred, labels=labels)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick to each matrix row/column before labelling it, instead of
# relying on an implicit extra tick in front of the first cell
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
print(f'The accuracy of the classifier train of FXP is: {accuracy_score(y_true, y_pred)}')

### Twitter classifier Confusion matrix

In [None]:
# Compare the Twitter-trained model's predictions with the OkCupid gender labels
y_true = list(okcupid_usernames['gender'])
y_pred = list(okcupid_usernames['twitter_predict'])
# `labels` must be passed by keyword: it is keyword-only in current scikit-learn
cm = confusion_matrix(y_true, y_pred, labels=labels)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick to each matrix row/column before labelling it, instead of
# relying on an implicit extra tick in front of the first cell
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
print(f'The accuracy of the classifier train of Twitter is: {accuracy_score(y_true, y_pred)}')

### user-gender classifier Confusion matrix

In [None]:
# Compare the names-dict-trained model's predictions with the OkCupid gender labels
y_true = list(okcupid_usernames['gender'])
y_pred = list(okcupid_usernames['train_predict'])
# `labels` must be passed by keyword: it is keyword-only in current scikit-learn
cm = confusion_matrix(y_true, y_pred, labels=labels)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick to each matrix row/column before labelling it, instead of
# relying on an implicit extra tick in front of the first cell
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
print(f'The accuracy of the classifier train of names dict is: {accuracy_score(y_true, y_pred)}')

### ensemble classifier Confusion matrix

In [None]:
# Compare the majority-vote label with the OkCupid gender labels
y_true = list(okcupid_usernames['gender'])
y_pred = list(okcupid_usernames['ensemble_label'])
# `labels` must be passed by keyword: it is keyword-only in current scikit-learn
cm = confusion_matrix(y_true, y_pred, labels=labels)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick to each matrix row/column before labelling it, instead of
# relying on an implicit extra tick in front of the first cell
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
print(f'The accuracy of the classifier train of ensemble is: {accuracy_score(y_true, y_pred)}')

In [None]:
# Save the per-model predictions and the ensemble vote. The final 'label' column
# is only added in the following cell, so it is not part of this file.
full_data.to_csv('/content/drive/MyDrive/Data Mining/full_data_tagged.csv')

In [None]:
# Final label: keep the dictionary-based gender where it is known and fall back
# to the ensemble vote where it is 'unknown'
known = full_data['gender'] != 'unknown'
full_data['label'] = full_data['gender'].where(known, full_data['ensemble_label'])
full_data[['username', 'gender', 'ensemble_label', 'label']]

In [None]:
# Save the table again, now including the final 'label' column
full_data.to_csv('/content/drive/MyDrive/Data Mining/full_data_labeled.csv')