In [17]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV

import re

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
tknzr = TweetTokenizer()
lemm = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

import pickle

In [15]:
%reload_ext watermark
%watermark -v -p pandas,scikit-learn,nltk,re,numpy

Python implementation: CPython
Python version       : 3.7.4
IPython version      : 7.8.0

pandas      : 1.2.3
scikit-learn: 0.0
nltk        : 3.4.5
re          : 2.2.1
numpy       : 1.19.5



In [3]:
df = pd.read_csv('17SDGpt2_tweets.csv', lineterminator = '\n', index_col = 0)

In [4]:
def decontracted(phrase):
    """decontracts the words to its expanded form"""
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [5]:
def clean_tweet(df):
    """function first creates a copy. Then cleans up text for http, @, ampersands, and clears for punctuations.  Then lemmatizes, tokenizes.
    """
    df = df.copy()
    
    #decontract
    df = df.apply(decontracted)
    #clean up https,@, &amp
    df = df.apply(lambda x: re.sub(r"http\S+","", x.lower()),1)\
    .apply(lambda i: " ".join(filter(lambda x: x[0]!="@", i.split())),1)\
    .apply(lambda x: re.sub(r"&amp", "",x),1)\
    .apply(lambda x: re.sub(r"&amp;","",x))\
    .apply(lambda x: re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', x),1)
    
    #lemmatize
    df = df.apply(lambda x: lemm.lemmatize(x))
    
    #tokenize
    df = df.apply(lambda x: tknzr.tokenize(x))
    df = df.apply(lambda x: ' '.join(x))
    return df

In [6]:
#Create Clean Tweets column and use only the labels and clean tweets
df['clean_tweet'] = clean_tweet(df['tweet'])
df = df[~df['clean_tweet'].isnull()]
df = df[['sdg', 'clean_tweet']]
df['sdg'] = df['sdg'].astype(int)
df.head()

Unnamed: 0,sdg,clean_tweet
0,16,our spousal sponsorship application sent to th...
1,16,star wars the vintage collection vc186 boba fe...
2,16,cds is hiring a team lead for our delivery pol...
3,16,was watching him on show … a person that was r...
4,16,as written xiv directly contradicts medical ad...


In [7]:
#TFIDF vectorizer 
tfidf = TfidfVectorizer()

#Column Transformer 
ct = make_column_transformer(
    (tfidf, 'clean_tweet'),
    remainder='passthrough')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df[['clean_tweet']], df['sdg'], test_size=0.3, random_state=777)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((113652, 1), (48708, 1), (113652,), (48708,))

In [9]:
lr = LogisticRegression(solver = 'saga', max_iter = 10000)
model = make_pipeline(ct, lr)
model.fit(X_train,y_train);

In [10]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.89      0.83      0.86      1807
           2       0.82      0.75      0.79      1798
           3       0.66      0.62      0.64      3817
           4       0.81      0.90      0.85      6486
           5       0.80      0.80      0.80      2366
           6       0.81      0.68      0.74      1758
           7       0.83      0.75      0.79      2393
           8       0.71      0.73      0.72      2408
           9       0.79      0.83      0.81      4209
          10       0.80      0.73      0.76      2471
          11       0.72      0.77      0.74      2898
          12       0.90      0.86      0.88      3031
          13       0.75      0.69      0.72      2096
          14       0.80      0.76      0.78      1660
          15       0.81      0.83      0.82      3901
          16       0.60      0.69      0.64      3440
          17       0.78      0.75      0.76      2169

    accuracy              

In [11]:
#Hyperparamater Tuning Logistic Regression 
'''
'columntransformer__tfidfvectorizer__max_features', 
'logisticregression__max_iter'
'columntransformer__tfidfvectorizer__ngram_range'
'logisticregression__C'
'''
model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'logisticregression', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__tfidfvectorizer', 'columntransformer__tfidfvectorizer__analyzer', 'columntransformer__tfidfvectorizer__binary', 'columntransformer__tfidfvectorizer__decode_error', 'columntransformer__tfidfvectorizer__dtype', 'columntransformer__tfidfvectorizer__encoding', 'columntransformer__tfidfvectorizer__input', 'columntransformer__tfidfvectorizer__lowercase', 'columntransformer__tfidfvectorizer__max_df', 'columntransformer__tfidfvectorizer__max_features', 'columntransformer__tfidfvectorizer__min_df', 'columntransformer__tfidfvectorizer__ngram_range', 'columntransformer__tfidfvectorizer__norm', 'columntransformer__tfidfvectorizer__preprocessor', 'columntransformer__tfidfvectorizer__smooth_idf

In [18]:
#Set parameters for randomforest to tune 
params = {}

params['columntransformer__tfidfvectorizer__min_df'] = np.arange(0.0001,0.001,0.0001)
params['columntransformer__tfidfvectorizer__ngram_range'] = [(1,3), (1,2), (1,1),(2,2), (3,3)]
params['logisticregression__C'] = np.arange(0,1,0.1)
params['logisticregression__max_iter'] = np.arange(10000,20000,1000)
params['logisticregression__solver'] = ['saga']

In [19]:
clf = RandomizedSearchCV(model, params, random_state=777)
clf.fit(X_train,y_train);

In [20]:
#Looking at the best model results 
results = pd.DataFrame(clf.cv_results_)
results.sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__solver,param_logisticregression__max_iter,param_logisticregression__C,param_columntransformer__tfidfvectorizer__ngram_range,param_columntransformer__tfidfvectorizer__min_df,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,9.246157,0.136309,0.828516,0.009457,saga,17000,0.8,"(1, 2)",0.0006,"{'logisticregression__solver': 'saga', 'logist...",0.759579,0.75782,0.761417,0.765332,0.759041,0.760638,0.002617,1
7,13.522171,0.498132,1.417569,0.025169,saga,18000,0.7,"(1, 3)",0.0008,"{'logisticregression__solver': 'saga', 'logist...",0.756148,0.753728,0.755037,0.760097,0.755829,0.756168,0.002135,2
4,4.910943,0.180703,0.440954,0.014164,saga,12000,0.1,"(1, 1)",0.0009,"{'logisticregression__solver': 'saga', 'logist...",0.728521,0.724869,0.728201,0.730796,0.727629,0.728003,0.001901,3
1,13.054611,0.171013,1.368712,0.017088,saga,17000,0.1,"(1, 3)",0.0009,"{'logisticregression__solver': 'saga', 'logist...",0.714091,0.710659,0.709855,0.715486,0.712407,0.7125,0.00209,4
2,9.051492,0.323113,0.826395,0.006932,saga,19000,0.1,"(1, 2)",0.0005,"{'logisticregression__solver': 'saga', 'logist...",0.708768,0.7056,0.705851,0.709063,0.706599,0.707176,0.00146,5


In [21]:
#The best score 
clf.best_score_

0.7606377703550679

In [22]:
# save the model to disk
filename = 'lr_finalized_model.sav'
pickle.dump(clf.best_estimator_, open(filename, 'wb'))

In [23]:
#save modelfitter and columntransformer
filename = 'lr_columntransformer.sav'
pickle.dump(ct, open(filename, 'wb'))

filename = 'lr_model_fit.sav'
pickle.dump(model, open(filename,'wb'))

In [24]:
results.to_csv('lr_results.csv')

In [25]:
df2 = pd.read_csv('val_set_sdg_1_7_8_12_13_toy.csv')
df3= pd.read_csv('train_set_sdg_1_7_8_12_13_toy.csv')
df4 = pd.read_csv('eval_set_sdg_1_7_8_12_13_curated_journals_toy.csv')

In [26]:
df_list = pd.concat([df2,df3,df4], ignore_index = True)
df_list= df_list[['title','sdg_id']].rename(columns = {'title':'clean_tweet', 'sdg_id':'sdg'})
df_list['sdg'] = df_list['sdg'].astype(int)
df_list

Unnamed: 0,clean_tweet,sdg
0,The transition from college to work during the...,1
1,Sustainability entrepreneurship and equitable ...,8
2,Determination of 41 polybrominated diphenyl et...,12
3,Mainstreaming sustainability into biodiversity...,12
4,A modular gene targeting system for sequential...,12
...,...,...
345,Policy options for sustainable development in ...,8
346,"A semi-empirical, electrochemistry-based model...",7
347,Impact of natural fractures in reservoir model...,8
348,Optimization of the medium composition for the...,7


In [27]:
y_pred2 = clf.predict(df_list[['clean_tweet']])

In [29]:
y_pred2

array([ 8, 13, 15, 15, 17, 15,  1, 14,  7, 16, 15, 15,  8, 13, 13,  9, 13,
       17,  9, 16, 12, 15,  7, 15, 12, 15, 11,  7, 14,  4,  7,  5,  9,  3,
       13,  3,  3, 17,  4,  9,  5, 13,  7, 11,  1,  9, 15, 12, 17, 13, 14,
        3, 14, 10,  3, 16, 15, 16,  2,  3, 15, 17, 12, 10,  7, 13,  9,  7,
       11,  9, 17,  8, 17,  9,  3,  4,  4,  8,  6, 17, 15,  8,  9,  9,  3,
       10,  8, 15, 15, 15, 13, 13, 15, 15, 12, 10, 10, 11,  9,  3,  9,  8,
       10, 15,  4, 12, 13,  7, 15, 17, 11,  1, 13, 12,  3, 13, 15, 15, 15,
        7,  9,  9, 12, 10, 13,  2, 15,  3,  3,  4,  9, 13,  3, 10,  3,  3,
        6,  9, 16, 15, 11,  3,  3, 15,  5, 12, 13, 14, 13, 17,  3, 13, 15,
        2, 16,  3, 14,  8, 15, 11,  3,  9, 11, 16, 10, 13, 14,  4,  7, 13,
        1, 10, 12, 16, 12, 12, 10,  3, 17,  7,  4,  9,  7, 15,  3, 12, 11,
        7,  3,  9, 17, 15, 14,  7,  3, 16, 11,  3,  3, 13,  3,  3,  4,  7,
        7,  7,  9, 10,  9, 15, 12, 15,  3,  9, 15, 12,  4,  7, 15, 11, 11,
       13,  4,  7, 15, 13

In [28]:
#Performs poorly outside the dataset need to collect more data this way 
print(classification_report(df_list['sdg'], y_pred2))

              precision    recall  f1-score   support

           1       0.60      0.07      0.13        41
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.70      0.22      0.33       119
           8       0.69      0.10      0.18        88
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.48      0.23      0.31        48
          13       0.58      0.39      0.47        54
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         0

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
