# Natural Language Processing notebook example

Welcome to the NLP notebook example!

The first time you run this notebook, execute the following line (you can comment it out afterwards). Make sure `requirements.txt` is available.

In [54]:
#!pip3 install -U -r requirements.txt

In [55]:
import pandas as pd 
import numpy as np
from gensim.models import Word2Vec
from time import time
import csv
import preprocessing as pp
import nlp_utility

# Read from MinIO

In [56]:
#!pip3 install -U liboidcagent requests xmltodict pandas boto3

In [57]:
!eval `oidc-keychain` > /dev/null && oidc-token dodas --time=3600 > /tmp/token
with open('/tmp/token') as f:
    token = f.readlines()[0].split("\n")[0]

In [58]:
import requests
import xmltodict

# Exchange the OIDC token for temporary S3 credentials through the
# AssumeRoleWithWebIdentity action of the MinIO endpoint.
r = requests.post("https://minio.cloud.infn.it",
                  data={
                      'Action':
                      "AssumeRoleWithWebIdentity",
                      'Version': "2011-06-15",
                      'WebIdentityToken': token,
                      'DurationSeconds': 9000
                  },
                  verify=True,
                  # Without a timeout a stalled connection blocks the kernel forever.
                  timeout=30)
# Surface HTTP errors (e.g. a rejected token) here rather than as an obscure
# XML-parsing or KeyError failure below.
r.raise_for_status()

tree = xmltodict.parse(r.content)

# Keep only the credential fields (AccessKeyId, SecretAccessKey, SessionToken, ...)
# as a plain dict; they are used to build the S3 client.
credentials = dict(tree['AssumeRoleWithWebIdentityResponse']
                    ['AssumeRoleWithWebIdentityResult']['Credentials'])

In [60]:

import boto3

# S3 client pointed at the MinIO endpoint, authenticated with the temporary
# credentials obtained from the AssumeRoleWithWebIdentity call.
s3 = boto3.client(
    's3',
    endpoint_url="https://minio.cloud.infn.it",
    aws_access_key_id=credentials["AccessKeyId"],
    aws_secret_access_key=credentials["SecretAccessKey"],
    aws_session_token=credentials["SessionToken"],
    config=boto3.session.Config(signature_version='s3v4'),
    verify=True,
)



In [62]:
# Read the test CSV from the 'scratch' bucket.
read_test = s3.get_object(
    Bucket='scratch',
    Key='verlato/dpc-covid19-ita-regioni.csv',
)
# The 'Body' entry of the response is handed straight to pandas.
df_test = pd.read_csv(read_test['Body'], sep=',')
df_test.head()

AttributeError: 'NoneType' object has no attribute 'items'

                  data stato  codice_regione denominazione_regione        lat  \
0  2020-02-24T18:00:00  ITA   13              Abruzzo               42.351222   
1  2020-02-24T18:00:00  ITA   17              Basilicata            40.639471   
2  2020-02-24T18:00:00  ITA   18              Calabria              38.905976   
3  2020-02-24T18:00:00  ITA   15              Campania              40.839566   
4  2020-02-24T18:00:00  ITA   8               Emilia-Romagna        44.494367   

        long  ricoverati_con_sintomi  terapia_intensiva  totale_ospedalizzati  \
0  13.398438  0                       0                  0                      
1  15.805148  0                       0                  0                      
2  16.594402  0                       0                  0                      
3  14.250850  0                       0                  0                      
4  11.341721  10                      2                  12                     

   isolamento_domiciliare 

In [None]:

# Read the example message file from the 'legger' bucket.
read_file = s3.get_object(
    Bucket='legger',
    Key='NLPInput/message_example.csv',
)
# The 'Body' entry of the response is handed straight to pandas.
df = pd.read_csv(read_file['Body'], sep=',')
df.head()

# Read local file

In [None]:
# Load the messages from a local CSV; this rebinds `df`, replacing whatever the MinIO cell loaded.
df=pd.read_csv('message_example.csv')

In [None]:
# Preview the first rows of the loaded messages.
df.head()

In [None]:
# Show the full content of each cell instead of truncating long messages.
# None means "no limit"; the former -1 sentinel is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
df.head()

## Preprocessing
Messages are cleaned of file paths, URLs, etc. (this process can be time-consuming).

Tokenization is done inline, and the tokens, with stopwords removed, are given as input to the Word2Vec model.

In [None]:
# Remember how many messages we start with, for the before/after report.
before_number = len(df)  # message number before cleaning
print('...cleaning')

# Time the cleaning step; its result is stored as a new column on df.
start_time_preproc = time()
df['cleaned_strings'] = pp.clean_messages(df['error_message'])
prepr_time = time() - start_time_preproc


In [None]:
# Work on a copy so that the in-place de-duplication that follows leaves `df` untouched.
df_cleaned=df.copy()

In [None]:
# Keep a single row per distinct cleaned message (modifies df_cleaned in place).
df_cleaned.drop_duplicates(subset=['cleaned_strings'], inplace=True)
after_number = len(df_cleaned)  # message number after cleaning

In [None]:
# Report how many rows were removed as duplicates of an already-seen cleaned message.
print("number of  messages before cleaning:",before_number)
print("number of  messages after cleaning:",after_number)

In [None]:
# Display the de-duplicated frame.
# NOTE(review): this renders the whole frame; consider df_cleaned.head() for large inputs.
df_cleaned

In [None]:
# Training callback from nlp_utility; it is passed to Word2Vec and its `loss_vec` is plotted afterwards.
c=nlp_utility.callback()

In [None]:
# Wrap the cleaned messages in the iterable corpus object consumed by Word2Vec.
corpus = pp.MyCorpus(df_cleaned)
print('...starting training')

# Train and time the model; the callback `c` is invoked by gensim during training.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names (renamed to
# `vector_size` / `epochs` in gensim 4); they match the `wv.vocab` usage further down.
start_time_train = time()
model = Word2Vec(
    sentences=corpus,
    compute_loss=True,
    size=300,
    window=7,
    min_count=1,
    workers=4,
    iter=30,
    callbacks=[c],
)
tot_time = time() - start_time_train

# Persist the trained model so it can be reloaded later with Word2Vec.load.
model.save('example_model.model')

In [None]:
from matplotlib import pyplot as plt

# Scatter the values collected in the callback's loss_vec against their index.
fig, ax = plt.subplots(figsize=(13, 8))
ax.scatter(np.arange(len(c.loss_vec)), c.loss_vec)
ax.set_xlabel('iter')
ax.set_ylabel('delta loss')


In [None]:
# Wall-clock durations measured around the cleaning and training steps.
print('preprocessing time: %.2f seconds'% prepr_time)
print('training time: %.2f seconds'%tot_time)

### Just a few words about the MyCorpus object

Why didn't we feed the Word2Vec model with a plain Python list of tokens? The MyCorpus object is much more memory friendly! A list would reside fully in memory; with a MyCorpus object, instead, **at most one vector** resides in RAM **at a time**. This way the corpus can be as large as we want.

In [None]:
# Print the corpus object itself (not its contents).
print(corpus)

*print* just outputs the address of the object in memory. To see the constituent vectors, let’s iterate over the corpus and print each document vector (one at a time):

In [None]:
# Iterate over the corpus and print each item it yields, one at a time.
for line in corpus:
    print(line)

## Understanding Word2Vec
Let's play a bit with our model to understand what it is doing.

Getting model vocabulary and total amount of words:

In [None]:
# Number of entries in the model vocabulary.
len(model.wv.vocab)

In [None]:
# Display the vocabulary mapping itself.
model.wv.vocab

Checking the "most similar words", using the default "cosine similarity" measure:

In [None]:
# Words most similar to 'pull' and 'push' taken together (both passed as positive terms).
print(model.wv.most_similar(positive=['pull','push'])) #sum

In [None]:
# Words most similar to 'pull' once 'push' is subtracted (passed as a negative term).
print(model.wv.most_similar(positive=['pull'],negative=['push']))#difference

Getting the similarity score for each pair and finding the word that does not match:

In [None]:
# Word pairs to compare against 'pull'; the trailing notes give the expected relationship.
pairs = [
    ('pull', 'push'),  # similar role
    ('pull', 'copy'),   # often close
    ('pull', 'time'),  # sometimes appearing together
    ('pull', 'directory'),    # sometimes appearing together
    ('pull', 'not'),
]
# Print each pair followed by its similarity score, tab-separated, to two decimals.
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, model.wv.similarity(w1, w2)))


In [None]:
# Ask the model which of the listed words fits least with the others.
print(model.wv.doesnt_match(['pull','push','mode','not']))

In [None]:
# Original error messages whose text contains 'pull'.
df_cleaned['error_message'][df_cleaned['error_message'].astype(str).str.contains('pull',na=False)]

In [None]:
# Rows whose error message contains both 'pull' and 'directory'.
df_cleaned[(df_cleaned['error_message'].astype(str).str.contains('pull',na=False)) & (df_cleaned['error_message'].astype(str).str.contains('directory',na=False) )]

Even if 'pull' and 'push' never appear together, the model understands they have a similar role (high similarity score):

In [None]:
# Rows whose error message contains both 'pull' and 'push'.
df_cleaned[(df_cleaned['error_message'].astype(str).str.contains('pull',na=False)) & (df_cleaned['error_message'].astype(str).str.contains('push',na=False) )]

Dictionary of known words and their frequency in the corpus:

In [None]:
# Map every vocabulary word to the `count` stored on its vocabulary entry.
w2c = {word: entry.count for word, entry in model.wv.vocab.items()}

# Same mapping ordered from the most to the least frequent word, plus the ordered word list.
w2cSorted = dict(sorted(w2c.items(), key=lambda pair: pair[1], reverse=True))
w2cSortedList = list(w2cSorted)
w2cSorted

## Getting more into NLP tasks

### Example of Supervised Learning
Let's test model ability to associate the right error category.

In [None]:
# Reload the Word2Vec model from the file written by model.save(...) in the training cell.
model_name='example_model.model'
model=Word2Vec.load(model_name)

In [None]:
# Turn the cleaned messages into vectors with the trained model.
# NOTE(review): tf_idf=True presumably enables TF-IDF weighting of the word vectors; confirm in nlp_utility.
vectors_sent=nlp_utility.vectorize_messages(df_cleaned['cleaned_strings'],model,tf_idf=True)

In [None]:
# Number of message vectors produced.
len(vectors_sent)


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the message vectors and their error categories.
x_train, x_test, y_train, y_test = train_test_split(
    vectors_sent,
    df_cleaned['error_category'].values,
    test_size=0.2,
    random_state=42,
)

# Second split of the raw messages with the same test_size and random_state,
# so the original text is available alongside the held-out vectors.
mex_train, mex_test, cat_train, cat_test = train_test_split(
    df_cleaned['error_message'].values,
    df_cleaned['error_category'].values,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Logistic-regression classifier mapping message vectors to error categories.
# class_weight='balanced' compensates for uneven category frequencies.
logreg = LogisticRegression(n_jobs=1, C=1e5,class_weight='balanced',multi_class='auto',solver='lbfgs',max_iter=170)
logreg = logreg.fit(x_train, y_train)

# Predict on the held-out vectors. `y_pred` must be defined here: the cells that
# follow (label-set difference and comparison table) read it, and would otherwise
# fail with NameError on a fresh kernel.
y_pred = logreg.predict(x_test)
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Warnings may appear if some labels in y_test never appear in y_pred (i.e. some labels are never predicted).

In [None]:
# Labels present in y_test that never occur in y_pred.
set(y_test) - set(y_pred) #if empty you shouldn't have warnings

In [None]:
# Side-by-side table of true label, predicted label and original message for the test split.
d = dict(y_test=y_test, y_pred=y_pred, mex=mex_test)
df_comparison = pd.DataFrame(d)
df_comparison

### 2-D Visualization of vector space: words and error messages (cleaned)

In [None]:
#allows interactive cursors

In [None]:
# Switch matplotlib to the 'nbagg' backend for the interactive plots that follow.
%matplotlib nbagg 

In [None]:
# Reduce the model's word vectors to x/y coordinates with labels, then plot them.
# NOTE(review): `title` looks like an output file name, but `save` is not passed here; confirm the helper's default.
title='vector_visualization.png'
x_vals, y_vals, labels =nlp_utility.reduce_dimensions(model)
figsize=(16, 8)
nlp_utility.plot_with_matplotlib(x_vals, y_vals, labels,figsize,title=title)


Click on a point to display its associated annotation.
To move the box, click on it and drag.

In [None]:
# Same visualisation for whole messages instead of single words.
title='sentence_visualization.png'

# Store each message vector on its row (adds a 'vectors_sent' column to df_cleaned in place).
df_cleaned['vectors_sent']=vectors_sent.tolist()
# Reduce the message vectors to x/y coordinates with labels via the nlp_utility helper.
x_vals_sent, y_vals_sent, labels_sent=nlp_utility.reduce_sent_dimensions(df_cleaned)
figsize=(16,8)
# save=True is passed together with `title`; presumably the figure is written to that file name (confirm in nlp_utility).
nlp_utility.plot_with_matplotlib(x_vals_sent, y_vals_sent, labels_sent,figsize,npoints=0,save=True,title=title)

