# Imports

In [24]:
import os
import re
import nltk
import zipfile
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_recall_fscore_support
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers  import Embedding
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras import layers
from keras import preprocessing

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# Download and extract the data

In [3]:
if('blogs.zip' not in os.listdir()):
    !curl 'http://u.cs.biu.ac.il/~koppel/blogs/blogs.zip' -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' -H 'Referer: http://u.cs.biu.ac.il/~koppel/BlogCorpus.htm' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en-US,en;q=0.9' --compressed --output blogs.zip
    with zipfile.ZipFile('blogs.zip', 'r') as zip_ref:
        zip_ref.extractall('./')

# Data cleaning and exploration

In [4]:
bloggers = os.listdir('blogs/')
print("There are {} bloggers".format(len(bloggers)))

There are 19320 bloggers


In [5]:
path = 'blogs/'

f = open(path+bloggers[0],'r',encoding="utf-8")

xml = f.read()

tags = set()
l = re.findall('<[^/].*?>',xml)
for el in l:
    tags.add(el)
print("The set of tags in the xml files are: {}".format(tags))

The set of tags in the xml files are: {'<Blog>', '<date>', '<post>'}


In [6]:
def data_2_frame(bloggers, sw = True):
    
    idx = []
    gender = []
    age = []
    industry = []
    astrologic = []
    dates = []
    posts = []
    
    stop_words = nltk.corpus.stopwords.words("english")
    
    for blogger in bloggers:

        try:

            f = open(path+blogger,'r',encoding="utf-8")
            xml = f.read()

        except UnicodeDecodeError:

            f = open(path+blogger,'r',encoding="latin-1")
            xml = f.read()
        
        if(sw):
            xml = re.sub(r'\b(' + r'|'.join(stop_words) + r')\b\s*','', xml)
            
        temp = re.findall('<post.*?>(.*?)</post>',xml,flags=re.DOTALL)
        
        posts += temp
        
        dates += re.findall('<date.*?>(.*?)</date>',xml,flags=re.DOTALL)

        nb_posts = len(temp)

        tmp = blogger.split(".")

        idx        += [tmp[0]]*nb_posts
        gender     += [tmp[1]]*nb_posts
        age        += [tmp[2]]*nb_posts
        industry   += [tmp[3]]*nb_posts
        astrologic += [tmp[4]]*nb_posts
    
    
    dico = {'id':idx,
        'gender':gender,
        'age':age,
        'industry':industry,
        'astrologic':astrologic,
        'post':posts,
        'date':dates}

    return pd.DataFrame.from_dict(dico)

In [7]:
data = data_2_frame(bloggers,True)

In [8]:
data.describe()

Unnamed: 0,id,gender,age,industry,astrologic,post,date
count,681288,681288,681288,681288,681288,681288,681288
unique,19320,2,26,40,12,612001,2616
top,449628,male,17,indUnk,Cancer,\n\n\t \n urlLink \n,"02,August,2004"
freq,4221,345197,80859,251015,65048,399,16545


In [9]:
data.head()

Unnamed: 0,id,gender,age,industry,astrologic,post,date
0,4200843,male,39,Technology,Sagittarius,"\n\n \n Why suns , obsession trying l...","21,August,2004"
1,4200843,male,39,Technology,Sagittarius,\n\n \n I'ordered Counter Strike:CZ ...,"18,August,2004"
2,4200843,male,39,Technology,Sagittarius,\n\n \n Remember jokes? Well I I nev...,"18,August,2004"
3,4200843,male,39,Technology,Sagittarius,\n\n \n If 'watch 'Waking Dead' make...,"16,August,2004"
4,4200843,male,39,Technology,Sagittarius,\n\n \n Sunday always feels like Sun...,"15,August,2004"


In [10]:
words = []
posts = data.post.values
word_per_post = []

for post in posts:
    
    words+=post.split(" ")
    
    word_per_post.append(len(post.split(" ")) - np.sum(np.array(post.split(" ")) == ""))
    
unique_words = pd.unique(words)

print("There are {} words and {} unique words, that are non stop words in the corpus.".format(len(words),len(unique_words)))
print("Maximum words number per post: {}\nMinimum words number per post: {}\nAverage words number per post: {}\nMedian: {}\n99% of posts contains less or equal than {} words"\
      .format(np.max(word_per_post),np.min(word_per_post),np.mean(word_per_post),np.median(word_per_post),np.sort(word_per_post)[np.int(len(word_per_post)*0.99)]))

There are 101264977 words and 3912652 unique words, that are non stop words in the corpus.
Maximum words number per post: 78477
Minimum words number per post: 2
Average words number per post: 121.63884730099458
Median: 71.0
99% of posts contains less or equal than 735 words


# Baseline

In [16]:
maxlen  = 735

max_words = 100000

texts = data.post.values

tokenizer = Tokenizer(num_words = max_words)

tokenizer.fit_on_texts(texts)

sequences = np.array(tokenizer.texts_to_sequences(texts))

word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Found 1218159 unique tokens.


In [17]:
X = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

y = data.gender.values

sss = StratifiedShuffleSplit(test_size = 0.25)

# utilise plabels au lieu de labels si tu veux tester sur les periodes
for train_index, test_index in sss.split(X,y):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((510966, 735), (170322, 735), (510966, 2), (170322, 2))

In [18]:
embedding_dim = 200

In [19]:
model = Sequential()
model.add(Embedding(max_words,embedding_dim, input_length = maxlen))
model.add(Flatten())
model.add(Dense(20, activation="sigmoid"))

model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 735, 200)          20000000  
_________________________________________________________________
flatten_2 (Flatten)          (None, 147000)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                2940020   
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 21        
Total params: 22,940,041
Trainable params: 22,940,041
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=["acc"])

history = model.fit(X_train, y_train['male'].values, epochs = 10, batch_size = 32, validation_split = 0.2)

Train on 408772 samples, validate on 102194 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
results = model.evaluate(X_test, y_test['male'].values)
print("evaluate on test set ",results)

evaluate on test set  [1.0146219053918617, 0.6245170911560503]


In [22]:
results = model.evaluate(X_train, y_train['male'].values)
print("evaluate on train set ",results)

evaluate on train set  [0.3403199375824062, 0.8655663977633016]


In [45]:
y_pred = model.predict(X_test)

pr, rc, fs,_ = precision_recall_fscore_support(y_test['male'].values,y_pred[:,0].round(),average='micro')

In [46]:
print("precision is {}%, recall is {}% and f1 score is {}%".format(pr*100, rc*100, fs*100))

precision is 62.451709115675015%, recall is 62.451709115675015% and f1 score is 62.451709115675015%
