# Group 58 ADA Gender Classification Assignment



### Importing the necessary libraries

In [1]:
#!pip install xgboost
#!pip install -U scikit-learn

import pandas as pd
import xgboost
import numpy as np
import re
import string
from xml.etree import ElementTree
import matplotlib.pyplot as plt
%matplotlib inline 
import nltk
import time
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from nltk.stem import SnowballStemmer
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Reading train and test data sets

In [2]:
# Load the label tables; each row pairs a user id with that user's gender label.
train = pd.read_csv("train_labels.csv")
test = pd.read_csv("test_labels.csv")

In [3]:
train.head()

Unnamed: 0,id,gender
0,d7d392835f50664fc079f0f388e147a0,male
1,ee40b86368137b86f51806c9f105b34b,female
2,919bc742d9a22d65eab1f52b11656cab,male
3,15b97a08d65f22d97ca685686510b6ae,female
4,affa98421ef5c46ca7c8f246e0a134c1,female


In [4]:
test.head()

Unnamed: 0,id,gender
0,d6b08022cdf758ead05e1c266649c393,male
1,9a989cb04766d5a89a65e8912d448328,female
2,2a1053a059d58fbafd3e782a8f7972c0,male
3,6032537900368aca3d1546bd71ecabd1,male
4,d191280655be8108ec9928398ff5b563,male


### Reading IDs from train and test to extract comments from the XML files

In [5]:
%%time
training_data_comment = []
training_data_gender = train['gender'].tolist()
training_data_ids = []
for ids in train['id']:
    
    gender = str(train[train["id"] == ids]['gender'])    
    
    file_name = ".\\data\\" + str(ids) + ".xml"
    
    xml_obj = ElementTree.parse(file_name)
    
    comments = xml_obj.findall('documents/document')
    comment_list = []
    for com in comments:
        comment_list.append(com.text)
    comm = ".".join(comment_list)
    training_data_comment.append(comm)
    training_data_ids.append(ids)

Wall time: 28.2 s


In [6]:
%%time
test_data_comment = []
test_data_gender = test['gender'].tolist()
test_data_ids = []
for ids in test['id']:    
    
    file_name = ".\\data\\" + str(ids) + ".xml"
    
    xml_obj = ElementTree.parse(file_name)
    
    test_comments = xml_obj.findall('documents/document')
    test_comment_list = []
    
    for com in test_comments:
        test_comment_list.append(com.text)
        
    comm = ".".join(test_comment_list)
    
    test_data_comment.append(comm)

    test_data_ids.append(ids)

Wall time: 3.8 s


In [7]:
# Replace the label-only frames with frames that also carry each user's joined tweet text.
train = pd.DataFrame({
    'id': training_data_ids,
    'tweets': training_data_comment,
    'gender': training_data_gender,
})
test = pd.DataFrame({
    'id': test_data_ids,
    'tweets': test_data_comment,
    'gender': test_data_gender,
})

In [8]:
train.head()

Unnamed: 0,id,tweets,gender
0,d7d392835f50664fc079f0f388e147a0,@CSIFERROSCAN youch! Good things to know! Is t...,male
1,ee40b86368137b86f51806c9f105b34b,Donald the Menace #ThanksComey https://t.co/j...,female
2,919bc742d9a22d65eab1f52b11656cab,This seems super sketch / too good to be true:...,male
3,15b97a08d65f22d97ca685686510b6ae,Just some texts with my dad about our Saturday...,female
4,affa98421ef5c46ca7c8f246e0a134c1,Irrevocably love this talented human and so pr...,female


In [9]:
test.head()

Unnamed: 0,id,tweets,gender
0,d6b08022cdf758ead05e1c266649c393,@JJMSports what odds he stops whining and goes...,male
1,9a989cb04766d5a89a65e8912d448328,Bingay!!!! I won a cool handy tonight #cashmon...,female
2,2a1053a059d58fbafd3e782a8f7972c0,The cynical manipulation of voters' desire for...,male
3,6032537900368aca3d1546bd71ecabd1,@9NowAU cannot convert b to object... on Sony ...,male
4,d191280655be8108ec9928398ff5b563,Cat Is a Kneading Maniac – Floppycats https://...,male


### EDA

In [10]:
# Count the training samples per gender label to check the class balance.
gender_counts = train['gender'].value_counts()
gender_counts

male      1552
female    1548
Name: gender, dtype: int64

###### The numbers of male and female samples are almost the same, so we can go ahead without rebalancing the data.

### Preprocessing

In [11]:
# start from NLTK's English stop-word list (a plain Python list of lowercase words)

stopwords_list = stopwords.words('english')
stopwords_list[:5]

['i', 'me', 'my', 'myself', 'we']

In [12]:
# the 26 lowercase ASCII letters, one list entry per letter

alphabet_string = [letter for letter in string.ascii_lowercase]
alphabet_string[:5]

['a', 'b', 'c', 'd', 'e']

In [13]:
# Combined stop-word list: the NLTK English stop words followed by the 26 single letters.
# It is rebuilt from its two sources rather than extended in place
# (stopwords_list = stopwords_list + ...), so re-running this cell does not
# append the alphabet a second time.

stopwords_list = stopwords.words('english') + list(string.ascii_lowercase)
print(stopwords_list[:5])  # stop words from nltk
print(stopwords_list[-5:])  # stop words (length 1)

['i', 'me', 'my', 'myself', 'we']
['v', 'w', 'x', 'y', 'z']


###### TRAIN DATA

In [14]:
# selecting the raw tweet text column (a pandas Series, not a list) for cleaning
tweets = train['tweets']

In [15]:
%%time

# using regular expressions to remove unwanted words
tweets = tweets.apply(lambda x: re.sub(r"http\S+|www\S+"," ",x))
# converting to lower case
tweets = tweets.apply(lambda x: x.lower())

# creating a tokeniser to split the words
tokenizer = RegexpTokenizer(r'\w+')
tweets = tweets.apply(lambda x: tokenizer.tokenize(x))

# converting all non alpha values to none
tweets = [[v for v in tweet if v.isalpha()]for tweet in tweets]

# lemmatising the words
lemmatizer = WordNetLemmatizer()
tweets = [[lemmatizer.lemmatize(token) for token in tweet] for tweet in tweets]

# removing the stopwords from the list created above
tweets = [[token for token in tweet if token not in stopwords_list] for tweet in tweets]

# creating a corpus for train
train_corpus = []
for item in tweets:
    item = ' '.join(item)
    train_corpus.append(item)

# forming them to text from list of words and updating the train data frame
train['tweets'] = tweets
train['tweets'] = train['tweets'].apply(lambda x: " ".join(x))

Wall time: 21.9 s


In [16]:
train.head()

Unnamed: 0,id,tweets,gender
0,d7d392835f50664fc079f0f388e147a0,csiferroscan youch good thing know sort stuff ...,male
1,ee40b86368137b86f51806c9f105b34b,donald menace thankscomey return national grea...,female
2,919bc742d9a22d65eab1f52b11656cab,seems super sketch good true legit invisible r...,male
3,15b97a08d65f22d97ca685686510b6ae,text dad saturday night plan westernbulldogs t...,female
4,affa98421ef5c46ca7c8f246e0a134c1,irrevocably love talented human proud tote sla...,female


###### TEST DATA

In [17]:
# selecting the raw tweet text column (a pandas Series, not a list) for cleaning
tweets = test['tweets']

In [18]:
%%time

# using regular expressions to remove unwanted words
tweets = tweets.apply(lambda x: re.sub(r"http\S+|www\S+"," ",x))
# converting to lower case
tweets = tweets.apply(lambda x: x.lower())

# creating a tokeniser to split the words
tokenizer = RegexpTokenizer(r'\w+')
tweets = tweets.apply(lambda x: tokenizer.tokenize(x))

# converting all non alpha values to none
tweets = [[v for v in tweet if v.isalpha()]for tweet in tweets]

# lemmatising the words
lemmatizer = WordNetLemmatizer()
tweets = [[lemmatizer.lemmatize(token) for token in tweet] for tweet in tweets]

# removing the stopwords from the list created above
tweets = [[token for token in tweet if token not in stopwords_list] for tweet in tweets]

# creating a corpus for test
test_corpus = []
for item in tweets:
    item = ' '.join(item)
    test_corpus.append(item)


# forming them to text from list of words and updating the test data frame
test['tweets'] = tweets
test['tweets'] = test['tweets'].apply(lambda x: " ".join(x))

Wall time: 3.71 s


In [19]:
test.head()

Unnamed: 0,id,tweets,gender
0,d6b08022cdf758ead05e1c266649c393,jjmsports odds stop whining go get proper job ...,male
1,9a989cb04766d5a89a65e8912d448328,bingay cool handy tonight cashmoney howboutdat...,female
2,2a1053a059d58fbafd3e782a8f7972c0,cynical manipulation voter desire honest gover...,male
3,6032537900368aca3d1546bd71ecabd1,cannot convert object sony braavia happening w...,male
4,d191280655be8108ec9928398ff5b563,cat kneading maniac floppycats left go hysteri...,male


### CREATE VECTORIZER

In [20]:
# building the tfidf vectorizer (word-level features over pre-cleaned text)
vectorizer = TfidfVectorizer(analyzer='word',input='content',
                           lowercase=False,                   # do NOT lowercase here; the corpus was already lower-cased during preprocessing
                           min_df=0.05,                      # drop terms appearing in fewer than 5% of the documents
                           max_df=0.95,                      # drop terms appearing in more than 95% of the documents
                           use_idf=True,
                           ngram_range=(1,2),               # extract both unigrams and bigrams
                           max_features = None)

In [21]:
# creating the tfidf for train set; this is where the vectorizer learns its vocabulary
tfidf_matrix = vectorizer.fit_transform(train_corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# .toarray() yields a plain ndarray rather than the np.matrix returned by .todense()
tfidftrain = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidftrain

Unnamed: 0,able,absolute,absolutely,accept,access,according,account,across,act,acting,...,yesterday,yet,yo,york,young,youtube,yr,yup,zealand,zero
0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.037598,0.058260,0.000000,0.000000,0.041463,0.000000,0.000000,0.0,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.0,0.035843,0.0,0.000000,0.000000,0.0,0.0,...,0.027057,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,0.000000,0.000000,0.034440,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.051348
3,0.000000,0.000000,0.035248,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.026568,0.053083,0.000000,0.037816,0.000000,0.000000,0.0,0.047066,0.000000
4,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3095,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.044991,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.050015,0.0,0.000000,0.000000
3096,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.053801,0.0,0.0,...,0.000000,0.035318,0.000000,0.000000,0.000000,0.049284,0.000000,0.0,0.000000,0.000000
3097,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.030658,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
3098,0.045574,0.111121,0.043611,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.062552,0.000000,0.000000,0.000000,0.0,0.000000,0.000000


In [22]:
# applying the vectorizer fitted on the train corpus to the test corpus
# (transform only, so the vocabulary and idf weights come from the training data)
tfidf_matrix = vectorizer.transform(test_corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# .toarray() yields a plain ndarray rather than the np.matrix returned by .todense()
tfidftest = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidftest

Unnamed: 0,able,absolute,absolutely,accept,access,according,account,across,act,acting,...,yesterday,yet,yo,york,young,youtube,yr,yup,zealand,zero
0,0.000000,0.034532,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.026369,0.000000,0.0,0.0,0.029079,0.0,0.000000,0.0,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.033952,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.046601,0.000000
2,0.031501,0.000000,0.000000,0.043292,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.032340,0.0,0.038477,0.0,0.000000,0.000000
3,0.034773,0.000000,0.000000,0.000000,0.042884,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.052633,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.098492,0.054988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.000000,0.000000,0.027640,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.029653,0.0,0.035280,0.0,0.110720,0.000000
496,0.000000,0.000000,0.031322,0.000000,0.000000,0.0,0.0,0.035963,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000
497,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.033119,0.025660,0.0,0.0,0.000000,0.0,0.000000,0.0,0.045458,0.000000
498,0.000000,0.000000,0.028710,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.064918,0.0,0.0,0.030801,0.0,0.000000,0.0,0.000000,0.000000


In [23]:
# total number of features: one entry per term in the fitted vocabulary.
# vocabulary_ exists in every scikit-learn release, whereas get_feature_names()
# was removed in 1.2.
len(vectorizer.vocabulary_)

1817

### MODEL BUILDING LOGISTIC REGRESSION

In [24]:
# building a logistic regression classifier with the library's default settings
# and fitting it on the TF-IDF training features against the gender labels
classifier = LogisticRegression()
classifier.fit(tfidftrain, train["gender"])

LogisticRegression()

In [25]:
# creating prediction and true label lists to compare result
predicted_labels = classifier.predict(tfidftest)

# The ground truth comes from the list captured when the test labels were read,
# not from test.pop('gender'): the column is overwritten with predictions just
# below, so popping it on a second run of this cell would return the
# predictions as "truth" and every metric would look perfect.
true_labels = pd.Series(test_data_gender, index=test.index, name='gender')
test['gender'] = predicted_labels

In [26]:
# creating an encoder for the categorical value output to calculate accuracy, precision and recall
# The encoder is fitted ONCE, on the training labels, and then only used to
# transform. Fitting it separately on the predictions and on the true labels
# can give the same gender different integer codes in the two arrays, e.g. when
# the model predicts a single class.
le = LabelEncoder()
le.fit(train["gender"])

# Both arrays are encoded from string sources that are never overwritten with
# integers, so this cell can be re-run safely.
true_labels = le.transform(test_data_gender)
predicted_labels = le.transform(test['gender'])  # test['gender'] holds the classifier's string predictions

In [27]:
# evaluating the untuned model: compute every metric first, then report them
confusion_matrix_no_tuning = confusion_matrix(true_labels, predicted_labels)
accuracy_pct_no_tuning = accuracy_score(true_labels, predicted_labels) * 100
precision_no_tuning = precision_score(true_labels, predicted_labels)
recall_no_tuning = recall_score(true_labels, predicted_labels)

print("Confusion Matrix : \n", confusion_matrix_no_tuning)
print("testing score is:", accuracy_pct_no_tuning, '%')
print("Precision:", precision_no_tuning)
print("Recall:", recall_no_tuning)

Confusion Matrix : 
 [[194  58]
 [ 41 207]]
testing score is: 80.2 %
Precision: 0.7811320754716982
Recall: 0.8346774193548387


### HYPER PARAMETER TUNING

In [28]:
# candidate values for C, the INVERSE of the regularisation strength (smaller C means
# stronger regularisation): 1000 points spaced evenly on a log scale from 1e-4 to 1e4
param_grid = [{"C": np.logspace(-4,4,1000)
             }]

In [29]:
# grid search over the candidate C values using 2-fold cross-validation (cv=2);
# n_jobs=-1 runs the fits in parallel and verbose=True prints progress
clf_wt = GridSearchCV(classifier, param_grid = param_grid, verbose=True,cv=2, n_jobs=-1)
classifier_logistic_grid = clf_wt.fit(tfidftrain,train["gender"])

Fitting 2 folds for each of 1000 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  4.4min finished


In [30]:
# predicting on the test features with the best estimator selected by the grid search
predicted_labels_wt = classifier_logistic_grid.best_estimator_.predict(tfidftest)

In [31]:
# printing the best parameter
classifier_logistic_grid.best_estimator_

LogisticRegression(C=1.9243509752303323)

In [32]:
# transforming the labels for calculating the confusion matrix, accuracy, precision and recall
# The mapping is learnt from the training labels rather than refitted on the
# predictions (fit_transform): an encoder fitted on predictions assigns codes
# from whatever classes happen to be predicted, which can disagree with the
# codes used for true_labels.
# The string predictions are recomputed here so that the cell can be re-run
# even after predicted_labels_wt has been replaced by its integer encoding.
string_predictions_wt = classifier_logistic_grid.best_estimator_.predict(tfidftest)
predicted_labels_wt = le.fit(train["gender"]).transform(string_predictions_wt)

In [33]:
# evaluating the tuned model: compute every metric (as a percentage) first, then report them
confusion_matrix_with_tuning = confusion_matrix(true_labels, predicted_labels_wt)
accuracy_pct_with_tuning = accuracy_score(true_labels, predicted_labels_wt) * 100
precision_pct_with_tuning = precision_score(true_labels, predicted_labels_wt) * 100
recall_pct_with_tuning = recall_score(true_labels, predicted_labels_wt) * 100

print("Confusion Matrix : \n", confusion_matrix_with_tuning)
print("testing score is:", accuracy_pct_with_tuning, '%')
print("Precision:", precision_pct_with_tuning, "%")
print("Recall:", recall_pct_with_tuning, "%")

Confusion Matrix : 
 [[197  55]
 [ 40 208]]
testing score is: 81.0 %
Precision: 79.08745247148289 %
Recall: 83.87096774193549 %


### CREATING PREDICTION CSV

In [34]:
# mapping the encoded predictions back to text: code 1 becomes 'male', any other code 'female'
text_labels = ['male' if val == 1 else 'female' for val in predicted_labels_wt]

text_labels[:5]

['male', 'female', 'male', 'male', 'male']

In [35]:
# ids of the test users, in the same order as the predictions
ids = test['id']

# two frames with identical layout: integer-coded labels and text labels
df_encoded = pd.DataFrame(
    {'id': list(ids), 'gender': list(predicted_labels_wt)},
    columns = ['id', 'gender'],
)
df = pd.DataFrame(
    {'id': list(ids), 'gender': list(text_labels)},
    columns = ['id', 'gender'],
)

# only the text-label frame is written out; df_encoded is kept for inspection
df.to_csv('pred_labels.csv', header = True, index = False)

In [36]:
df.head()

Unnamed: 0,id,gender
0,d6b08022cdf758ead05e1c266649c393,male
1,9a989cb04766d5a89a65e8912d448328,female
2,2a1053a059d58fbafd3e782a8f7972c0,male
3,6032537900368aca3d1546bd71ecabd1,male
4,d191280655be8108ec9928398ff5b563,male
