In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [2]:
#We first import the dataset
df= pd.read_csv(r'C:\Users\Javiera Vines\Documents\Projects\Predicting_age_and_gender\train.csv')

In [3]:
df.head(5)

Unnamed: 0,ID,keywords,age,sex
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62,F
1,2,restaurant:1;marrakech.shtml:1,35,M
2,3,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45,F
3,4,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46,F
4,5,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42,F


In [4]:
df.isnull().sum()

ID               0
keywords    846396
age              0
sex              0
dtype: int64

In [5]:
#We drop empty rows
df.dropna(subset=['keywords'], inplace=True)
df.shape

(6418659, 4)

In [6]:
#run this line just to observe how the code works
prueba=df[0:100000].copy()
#or run this one below, if you may want to run it in the complete dataset
#prueba=df.copy()

In [7]:
prueba.shape

(100000, 4)

In [8]:
import nltk
from nltk.corpus import stopwords

We created a function that would split words and their frequencies, and return the words multiplied by their frequencies. We also eliminated stopwords for french language

In [9]:
def split_item(text):
    #start_time = time.time()
    result = ""
    
    words_with_weight = text.split(';')
    for word_with_weight in words_with_weight:
        word_weight = word_with_weight.split(':')
        word = word_weight[0].lower()
        try:
            if word not in stopwords.words('french'):
                result += f"{word} " * int(word_weight[1])
        except: pass
    #end_time = time.time()
    #print(f"function took {(end_time - start_time) * 1000.0} ms")
    return result.strip()

In [10]:
#we run the function
prueba["keywords"]=prueba["keywords"].map(lambda x: split_item(x))

In [11]:
#we print to check if  the function works
prueba.head()

Unnamed: 0,ID,keywords,age,sex
0,1,fibre fibre fibre fibre fibre fibre fibre fibr...,62,F
1,2,restaurant marrakech.shtml,35,M
2,3,payer faq taxe habitation macron detail programme,45,F
3,4,rigaud rigaud rigaud laurent laurent laurent p...,46,F
4,5,societe disparition proche m%c3%a9lanie.gonide...,42,F


In [12]:
prueba["sex"]=prueba["sex"].replace("M",1)
prueba["sex"]=prueba["sex"].replace("F",0)

In [13]:
prueba.head()

Unnamed: 0,ID,keywords,age,sex
0,1,fibre fibre fibre fibre fibre fibre fibre fibr...,62,0
1,2,restaurant marrakech.shtml,35,1
2,3,payer faq taxe habitation macron detail programme,45,0
3,4,rigaud rigaud rigaud laurent laurent laurent p...,46,0
4,5,societe disparition proche m%c3%a9lanie.gonide...,42,0


In [14]:
#After appliying the fuction, there were new cells that shown Null value,
#so it was necessary to drop again these values
prueba.dropna(subset=['keywords'], inplace=True)

In [15]:
#We kept 99.5% of the data
prueba.count()

ID          100000
keywords    100000
age         100000
sex         100000
dtype: int64

# GENDER PREDICTION

Gender Prediction is a classification problem due to we have one variables with two possible labels: "Male" and "Female". Therefore, our prediction will be related with the output probability of being a male or female (in this case using thereshold 0.5)

In [16]:
#Divide into train and test subsets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(prueba["keywords"],prueba["sex"], test_size = 0.2, random_state = 42)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vc_tf_idf = TfidfVectorizer()

In [18]:
#We fit our training data
vc_tf_idf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [19]:
#We transform our data
X_train_tf = vc_tf_idf.transform(X_train)
X_test_tf = vc_tf_idf.transform(X_test)

## Multinomial Naive Bayes

In [20]:
#We import and train the model
prediction = dict()
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tf,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
prediction["Multinomial"] = nb.predict(X_test_tf)

In [22]:
#test accuracy
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(y_test, prediction["Multinomial"])

0.6109

In [23]:
print("train score:", nb.score(X_train_tf, y_train))
print("test score:", nb.score(X_test_tf, y_test))

train score: 0.7106
test score: 0.6109


## Random Forest Classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
#We define the model and the number of estimators
rfc = RandomForestClassifier(n_estimators = 50, random_state = 32)

In [26]:
#We train the model on the training dataset
rfc.fit(X_train_tf,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=32, verbose=0,
                       warm_start=False)

In [27]:
#We run the model on the testing dataset
prediction["RFC"] = rfc.predict(X_test_tf)

In [28]:
#We measure the accuracy
accuracy_score(y_test,prediction["RFC"])

0.6038

In [29]:
print("rfc train score:", rfc.score(X_train_tf, y_train))
print("rfc test score:", rfc.score(X_test_tf, y_test))

rfc train score: 0.8802875
rfc test score: 0.6038


# AGE PREDICTION

Age Prediction is a regression problem due to we have one continuos variable for what we are going to predict one unique value

In [30]:
#split again train and test, now for age prediction
X_train,X_test,y_train_a,y_test_a = train_test_split(prueba["keywords"],prueba["age"], test_size = 0.2, random_state = 42)

## Linear Regression

In [31]:
#We train the model on the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression() 

In [32]:
lm_reg = regressor.fit(X_train_tf,y_train_a) #training the algorithm
age_pred = regressor.predict(X_test_tf)

In [33]:
#To retrieve the intercept:
print(regressor.intercept_)
#For retrieving the slope:
print(regressor.coef_)

45.28722422435149
[ -31.26207373   -0.53375171  -12.7150085  ... -123.88985853 -139.89507898
   -5.76208258]


In [34]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_a, age_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test_a, age_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_a, age_pred)))

Mean Absolute Error: 25.623338460445755
Mean Squared Error: 10050.66424260907
Root Mean Squared Error: 100.25300116509764


## Random Forest

In [35]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
#Define the model estimators
rfr = RandomForestRegressor(n_estimators = 50, random_state = 42)

In [37]:
#We train the model
rfr.fit(X_train_tf, y_train_a)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [38]:
#We run the model on the testing dataset
prediction["RFR"] = rfr.predict(X_test_tf)

In [39]:
#We obtain general performance indicators
errors = abs(prediction["RFR"] - y_test_a)
mape = 100 * (errors / y_test_a)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 73.77 %.


# Prediction on Test Set

In order to predict the results on the test set, it was decided to use: Multinomial NB for gender, and Linear Regression for age prediction.
Similar to training dataset, first we need to process the data to prepare it for the models to run.

In [40]:
#We import the "Test" dataset
df_test = pd.read_csv(r"C:\Users\Javiera Vines\Documents\Projects\Predicting_age_and_gender\test.csv")

In [41]:
#We drop empty rows
df_test.dropna(subset=['keywords'], inplace=True)
df_test.shape

(2748743, 4)

In [42]:
#Initially we defined prueba_t as a subset of 10,000 rows in order to test the code: prueba_t=df_test[0:10000].copy()
#In order to not re-write the code, we later defined prueba_t as a copy of the original dataset
prueba_t=df_test[0:1000].copy()

In [43]:
prueba_t.head(5)

Unnamed: 0,ID,keywords,age,sex
1,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,,
2,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,,
3,4,002lundu83vnndv:1,,
4,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,,
5,6,disparition:1;vue:1;maelys:1;deuxieme:1;place:...,,


In [44]:
#We executed our function on the dataset to separate words and frequencies just like we did with the training dataset.
prueba_t["keywords"]= prueba_t["keywords"].map(lambda x: split_item(x))

In [45]:
prueba_t.shape

(1000, 4)

After running the function, we can observe that words appear many times depending on their frequencies.
Additionally, we can also observe that some columns are left with a NaN, representing that there is not a valid word that can allow us to predict age and gender (eg. ID 4, that initially had 002lundu83vnndv as a word, and now appears as Nan)

In [46]:
prueba_t.head()

Unnamed: 0,ID,keywords,age,sex
1,2,cecilia.gosselin flash ville obseques economie...,,
2,3,p1_1697235 peut jcms acceptees beneficiaire as...,,
3,4,002lundu83vnndv,,
4,5,high high high patisserie apple apple apple te...,,
5,6,disparition vue maelys deuxieme place actu fla...,,


In [47]:
prueba_t.count()

ID          1000
keywords    1000
age            0
sex            0
dtype: int64

In [48]:
#Dropping again empty rows
prueba_t.dropna(subset=['keywords'], inplace=True)

In [49]:
prueba_t.count()

ID          1000
keywords    1000
age            0
sex            0
dtype: int64

In [50]:
#we transform the whole dataset
test_k = vc_tf_idf.transform(prueba_t["keywords"])

In [51]:
#we run the prediction on gender with the Multinomial NB we trained earlier with the whole training dataset
prediction["gender"] = pd.DataFrame(nb.predict(test_k))
prediction["gender"]

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
995,1
996,0
997,1
998,0


In [52]:
#We make sure we have no null values
prediction['gender'].isnull().sum()
prediction['gender'].count()

0    1000
dtype: int64

In [53]:
#We predict age with the linear regression model we trained earlier with the whole training dataset
prediction["age"] = pd.DataFrame(regressor.predict(test_k))
prediction["age"]

Unnamed: 0,0
0,176.562748
1,57.109504
2,45.287224
3,44.317775
4,44.998148
...,...
995,70.393437
996,47.970780
997,32.809789
998,47.676349


In [54]:
#we integrate the resutls into the database.
prueba_t['sex_pred']=prediction['gender']
prueba_t['age_pred']=prediction['age']
prueba_t.head(5)

Unnamed: 0,ID,keywords,age,sex,sex_pred,age_pred
1,2,cecilia.gosselin flash ville obseques economie...,,,1.0,57.109504
2,3,p1_1697235 peut jcms acceptees beneficiaire as...,,,1.0,45.287224
3,4,002lundu83vnndv,,,1.0,44.317775
4,5,high high high patisserie apple apple apple te...,,,1.0,44.998148
5,6,disparition vue maelys deuxieme place actu fla...,,,1.0,44.998148


# Exporting the outputs

In [55]:
#We export the results into a final dataset with the information required
Results = prueba_t[['ID','age_pred','sex_pred']].copy()
Results["sex_pred"]=Results["sex_pred"].replace(1,"M")
Results["sex_pred"]=Results["sex_pred"].replace(0,"F")
Results.head(5)

Unnamed: 0,ID,age_pred,sex_pred
1,2,57.109504,M
2,3,45.287224,M
3,4,44.317775,M
4,5,44.998148,M
5,6,44.998148,M


In [56]:
Results.isnull().sum()
Results.count()

ID          1000
age_pred     870
sex_pred     870
dtype: int64

In [57]:
#We export the csv file
Results.to_csv('Results_test.csv')