### Loading data and libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing, model_selection,feature_extraction

In [2]:
from sklearn.model_selection import RandomizedSearchCV

In [3]:
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from nlp_utils.model import train_model

In [6]:
from sklearn import decomposition

In [7]:
data_folder = 'data/'

In [8]:
## using the cleaned files
train_data = pd.read_csv(data_folder+'train_clean.csv'); print(train_data.shape)
test_data = pd.read_csv(data_folder+'test_clean.csv'); print(test_data.shape)

(7613, 6)
(3263, 5)


In [9]:
# train_data.head().iloc[4]['text']

In [10]:
train_data[train_data['target'] == 1].sample()['text'].values[0]

"Well Saturn doesn't exist anymore. So the collision place has a starting estimate of $4000. That's 3 times what my car is worth."

In [11]:
# train_data.location.value_counts()

In [12]:
sample_submission = pd.read_csv(data_folder+'sample_submission.csv')
# sample_submission.head()

### Count vectors

In [13]:
# create a count vectorizer object 
count_vect = feature_extraction.text.CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_data['text_clean'])


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [14]:
count_train = count_vect.transform(train_data['text_clean'])
count_test = count_vect.transform(test_data['text_clean'])

In [15]:
count_train_x, count_valid_x, count_train_y, count_valid_y = train_test_split(count_train, train_data['target'], 
                                                                              test_size = 0.15, random_state = 44)

In [16]:
count_train_y.value_counts()

0    3696
1    2775
Name: target, dtype: int64

### Latent Dirilecht Allocation

In [44]:
num_topics = 20
iterations = 20

In [45]:
lda_model = decomposition.LatentDirichletAllocation(n_components=num_topics,learning_method='online',max_iter=iterations)

In [46]:
x_topics = lda_model.fit_transform(count_train_x)

In [47]:
x_topics.shape

(6471, 20)

In [48]:
x_topics_valid = lda_model.transform(count_valid_x)
x_topics_valid.shape

(1142, 20)

In [49]:
test_topics = lda_model.transform(count_test)

In [50]:
# decomposition.LatentDirichletAllocation?

In [51]:
topic_word = lda_model.components_

In [52]:
[topic_word[2].argmax()]

[3747]

In [53]:
vocab = count_vect.get_feature_names()

In [54]:
vocab[9738]

'ok'

In [55]:
# vocab

In [56]:
# get topics

In [57]:
n_top_words = 3

In [58]:
topic_summaries = []

In [59]:
topic_word.shape

(20, 15700)

In [60]:
for i,topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:(-n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
    

In [61]:
topic_summaries

['im',
 'time',
 'disaster',
 'emergency',
 'like',
 'attack',
 'first',
 'best',
 'û',
 'amp',
 'fire',
 'year',
 'video',
 'wreck',
 'say',
 'life',
 'one',
 'via',
 'wildfire',
 'know']

### RidgeClassifier

In [62]:
clf = linear_model.RidgeClassifier(class_weight='balanced', random_state=42, alpha=5)

In [63]:
# clf.fit(count_train_x, count_train_y)

In [64]:
## cross validating
scores = model_selection.cross_val_score(clf, x_topics, count_train_y, scoring='f1', cv=3)

In [65]:
scores

array([0.5764411 , 0.59939455, 0.59910135])

In [66]:
train_model(clf, x_topics, count_train_y, x_topics_valid, count_valid_y, test_topics
#             , 
#             submissions_data=sample_submission, submissions_file_prefix="tfidf_ridge_submissions" 
           )

Classification report : 

              precision    recall  f1-score   support

           1       0.57      0.66      0.61       496
           0       0.70      0.63      0.66       646

   micro avg       0.64      0.64      0.64      1142
   macro avg       0.64      0.64      0.64      1142
weighted avg       0.65      0.64      0.64      1142



### Logistic Regression

In [67]:
clf_2 = linear_model.LogisticRegressionCV(random_state=42, scoring = 'f1', class_weight='balanced', cv = 3, max_iter=1000
                                          #, Cs=np.arange(1.5,2.3,0.1)
                                         )

In [68]:
clf_2.fit(x_topics, count_train_y)

LogisticRegressionCV(Cs=10, class_weight='balanced', cv=3, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=1000,
           multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
           refit=True, scoring='f1', solver='lbfgs', tol=0.0001, verbose=0)

In [69]:
clf_2.scores_

{1: array([[0.56777494, 0.57214681, 0.57200403, 0.57070707, 0.57746479,
         0.57701957, 0.57630522, 0.57630522, 0.57630522, 0.57630522],
        [0.58541667, 0.595226  , 0.59541985, 0.60212874, 0.59757331,
         0.59656218, 0.59626074, 0.59626074, 0.59626074, 0.59626074],
        [0.5945674 , 0.5990099 , 0.59851117, 0.5988024 , 0.59679037,
         0.59829915, 0.598     , 0.598     , 0.598     , 0.598     ]])}

In [70]:
clf_2.Cs_

array([1.00000000e-04, 7.74263683e-04, 5.99484250e-03, 4.64158883e-02,
       3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
       1.29154967e+03, 1.00000000e+04])

In [71]:
clf_2.C_

array([2.7825594])

In [72]:
logistic_model = linear_model.LogisticRegression(C = 2.7, random_state=42, max_iter=500, class_weight = 'balanced')

In [73]:
train_model(logistic_model, x_topics, count_train_y, x_topics_valid, count_valid_y, test_topics, 
            submissions_data=sample_submission, submissions_file_prefix="topic_modeling_logistic_submissions"  )

Classification report : 

              precision    recall  f1-score   support

           1       0.58      0.66      0.62       496
           0       0.71      0.64      0.67       646

   micro avg       0.65      0.65      0.65      1142
   macro avg       0.65      0.65      0.65      1142
weighted avg       0.65      0.65      0.65      1142

Exporting data to: 

	 data/topic_modeling_logistic_submissions_20210218111236.csv


