# Kaggle Competition:
# <center> Natural Language Processing with Disaster Tweets

### Predict which Tweets are about real disasters and which ones are not

In [3]:
# import libraries

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


#classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm.sklearn import LGBMClassifier

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jesic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jesic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jesic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
rndm = 123

In [6]:
# load the datasets
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_sample = pd.read_csv("sample_submission.csv")

In [7]:
df_train.sample(5)

Unnamed: 0,id,keyword,location,text,target
42,62,ablaze,milky way,Had an awesome time visiting the CFC head offi...,0
6085,8692,sinking,"Cypress, CA 90630",Do you feel like you are sinking in low self-i...,0
1064,1535,bomb,,The Guardian view on the Hiroshima legacy: sti...,1
5512,7865,quarantined,,Officials: Alabama home quarantined over possi...,1
7091,10156,upheaval,Connecticut,A look at state actions a year after Ferguson'...,1


In [8]:
df_test.sample(5)

Unnamed: 0,id,keyword,location,text
778,2567,crash,"Austin, TX",lol at the guy whipping by me on a double yell...
351,1135,blight,"Cleveland, Ohio",Lots of talk of how susceptible NY2 is to shoo...
2899,9604,thunder,,Idk if ithats thunder or people rolling in the...
2354,7875,quarantined,Venezuela,Officials: Alabama home quarantined over possi...
1098,3615,desolation,Quezon City,The Hobbit: The Desolation of Smaug ?? ? atm.


In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [10]:
df_train["keyword"].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [11]:
df_train["location"].value_counts()

location
USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: count, Length: 3341, dtype: int64

In [12]:
#check duplicate text
text_duplicates = df_train[df_train.duplicated(subset=['text'])].sort_values(by="text")["text"].drop_duplicates()
text_duplicates

4299    #Allah describes piling up #wealth thinking it...
6366    #Bestnaijamade: 16yr old PKK suicide bomber wh...
2828    #KCA #VoteJKT48ID 12News: UPDATE: A family of ...
2841    #Myanmar  Displaced #Rohingya at #Sittwe point...
1725    #Newswatch: 2 vehicles collided at Lock and La...
                              ...                        
4631    incident with injury:I-495  inner loop Exit 31...
1349    like for the music video I want some real acti...
3461    that exploded &amp; brought about the\nbeginni...
6103    that horrible sinking feeling when youÛªve be...
5641    wowo--=== 12000 Nigerian refugees repatriated ...
Name: text, Length: 69, dtype: object

In [13]:
df_train[df_train.duplicated(subset=['text'])].sort_values(by="text").iloc[0,3]

'#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect'

In [14]:
#remove duplicates where the target is not the same
df_train2 = df_train.copy()

for i in text_duplicates:
    if len(df_train2[df_train2["text"]==i]["target"].unique())>1:
        df_train2 = df_train2[df_train2["text"]!=i]

In [15]:
df_train2

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [16]:
stop_words = stopwords.words('english') # defining stop_words
#stop_words.remove('not') # removing not from the stop_words list as it contains value in negative movies
lemmatizer = WordNetLemmatizer()

In [17]:
def data_preprocessing(review):
    
  # data cleaning
  review = re.sub(re.compile('<.*?>'), '', review) #removing html tags
  #review =  re.sub('[^A-Za-z0-9]+', ' ', review) #taking only words
  
  # lowercase
  review = review.lower()
  
  # tokenization
  tokens = nltk.word_tokenize(review) # converts review to tokens
  
  # stop_words removal
  review = [word for word in tokens if word not in stop_words] #removing stop words
  
  # lemmatization
  review = [lemmatizer.lemmatize(word) for word in review]
  
  # join words in preprocessed review
  review = ' '.join(review)
  
  return review

In [18]:
df_train_text = df_train2.copy()
df_train_text.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [19]:
#replace missing values
df_train_text["keyword"] = df_train_text["keyword"].fillna("nan")
df_train_text["location"] = df_train_text["location"].fillna("nan")
df_test["keyword"] = df_test["keyword"].fillna("nan")


In [20]:
df_train_text.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7558 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7558 non-null   int64 
 1   keyword   7558 non-null   object
 2   location  7558 non-null   object
 3   text      7558 non-null   object
 4   target    7558 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 354.3+ KB


In [21]:
df_train_text['preprocessed_review'] = df_train_text['text'].apply(lambda review: data_preprocessing(review))

In [22]:
df_train_text

Unnamed: 0,id,keyword,location,text,target,preprocessed_review
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason # earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask . canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked 'shelter place ' notified offic...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive # wildfire evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby # alaska smoke # wildfire ...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,two giant crane holding bridge collapse nearby...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,@ aria_ahrary @ thetawniest control wild fire ...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m1.94 [ 01:04 utc ] ? 5km volcano hawaii . htt...
7611,10872,,,Police investigating after an e-bike collided ...,1,police investigating e-bike collided car littl...


In [23]:
data = df_train_text[["preprocessed_review","target","keyword","location"]].copy()
y = data['target'].values
data.drop(['target'], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, stratify=y, random_state = rndm)

print("Train data:",  X_train.shape, y_train.shape)
print("Test data:",  X_test.shape, y_test.shape)

Train data: (5290, 3) (5290,)
Test data: (2268, 3) (2268,)


In [24]:
vect = CountVectorizer(min_df=10)

X_train_review_bow = vect.fit_transform(X_train['preprocessed_review'])
X_test_review_bow = vect.transform(X_test['preprocessed_review'])

print('X_train_review_bow shape: ', X_train_review_bow.shape)
print('X_test_review_bow shape: ', X_test_review_bow.shape)

X_train_review_bow shape:  (5290, 1090)
X_test_review_bow shape:  (2268, 1090)


In [25]:
vectorizer = TfidfVectorizer(min_df=10)

X_train_review_tfidf = vectorizer.fit_transform(X_train['preprocessed_review'])
X_test_review_tfidf = vectorizer.transform(X_test['preprocessed_review'])

print('X_train_review_tfidf shape: ', X_train_review_tfidf.shape)
print('X_test_review_tfidf shape: ', X_test_review_tfidf.shape)

X_train_review_tfidf shape:  (5290, 1090)
X_test_review_tfidf shape:  (2268, 1090)


In [26]:
df_test_text = df_test[["text"]].copy()
df_test_text['preprocessed_review'] = df_test_text['text'].apply(lambda review: data_preprocessing(review))
data_test = df_test_text[["preprocessed_review"]].copy()
X_testsub_review_bow = vect.transform(data_test['preprocessed_review'])
X_testsub_review_tfidf = vectorizer.transform(data_test['preprocessed_review'])


In [27]:
X_train_bow=X_train_review_bow.toarray()
X_test_bow=X_test_review_bow.toarray()
X_testsub_bow = X_testsub_review_bow.toarray()

In [28]:
X_train_bow_df=pd.DataFrame(X_train_bow)
y_train_pd = pd.Series(y_train)

In [29]:
X_test_bow_df=pd.DataFrame(X_test_bow)
X_testsub_bow_df=pd.DataFrame(X_testsub_bow)

In [30]:
key_train = pd.get_dummies(data=X_train["keyword"], columns=["keyword"],drop_first=False, dtype=float).reset_index(drop=True)
key_test = pd.get_dummies(data=X_test["keyword"], columns=["keyword"],drop_first=False, dtype=float).reset_index(drop=True)
key_testsub = pd.get_dummies(data=df_test["keyword"], columns=["keyword"],drop_first=False, dtype=float).reset_index(drop=True)

In [31]:
# align test and train dataset
key_testsub,key_train=key_testsub.align(key_train,join='right',axis=1)
key_test,key_train=key_test.align(key_train,join='right',axis=1)



In [32]:
X_train_bow_df2 = pd.concat([X_train_bow_df, key_train.reindex(X_train_bow_df.index)], axis=1)
X_test_bow_df2 = pd.concat([X_test_bow_df, key_test.reindex(X_test_bow_df.index)], axis=1)
X_testsub_bow_df2 = pd.concat([X_testsub_bow_df, key_testsub.reindex(X_testsub_bow_df.index)], axis=1)


In [33]:
X_train_bow_df2.columns = X_train_bow_df2.columns.astype(str)

In [34]:
X_test_bow_df2.columns = X_test_bow_df2.columns.astype(str)

## Traditional Modeling

In [35]:
clf = LogisticRegression(penalty='l2')
clf.fit(X_train_bow_df2, y_train)

y_pred = clf.predict(X_test_bow_df2)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print('Test f1: ', f1_score(y_test, y_pred))

Test Accuracy:  0.7971781305114638
Test f1:  0.7491821155943293


In [36]:
clf = MultinomialNB()
clf.fit(X_train_bow_df2, y_train)

y_pred = clf.predict(X_test_bow_df2) #prediction from model
print('Test Accuracy: ', accuracy_score(y_test, y_pred)) 


print('Test f1: ', f1_score(y_test, y_pred)) 

Test Accuracy:  0.8011463844797179
Test f1:  0.762005277044855


In [37]:
lgbm = LGBMClassifier()
lgbm.fit(X_train_bow_df2, y_train)

y_pred = lgbm.predict(X_test_bow_df2) #prediction from model
print('Test Accuracy: ', accuracy_score(y_test, y_pred)) 
print('Test f1: ', f1_score(y_test, y_pred))

Test Accuracy:  0.7923280423280423
Test f1:  0.7370184254606366


In [38]:
dtc2 = DecisionTreeClassifier()
dtc2.fit(X_train_bow_df2, y_train)

y_pred = dtc2.predict(X_test_bow_df2)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print('Test f1: ', f1_score(y_test, y_pred)) 

Test Accuracy:  0.7544091710758377
Test f1:  0.7136246786632391


In [39]:
rfc = RandomForestClassifier()
rfc.fit(X_train_bow_df2, y_train)

y_pred = rfc.predict(X_test_bow_df2)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print('Test f1: ', f1_score(y_test, y_pred)) 

Test Accuracy:  0.7892416225749559
Test f1:  0.7435622317596567


In [40]:
base_models_clas = [
    ('KNN', KNeighborsClassifier()),
    ('SVC',SVC()),
    ('Random Forest',RandomForestClassifier()),
    ('Logistic Regression',LogisticRegression()),
    ('XGboost',XGBClassifier()),
    ('Ridge', RidgeClassifierCV()),
    ]
stacked_clas = StackingClassifier(   
    estimators = base_models_clas,
    final_estimator = LogisticRegression(    max_iter=1000
),
    cv = 5,
    )

In [41]:
stacked_clas.fit(X_train_bow_df2, y_train)    
stacked_prediction = stacked_clas.predict(X_test_bow_df2)

In [42]:
print('Test Accuracy: ', accuracy_score(y_test, stacked_prediction))
print('Test f1: ', f1_score(y_test, stacked_prediction)) 

Test Accuracy:  0.8174603174603174
Test f1:  0.7727771679473107


In [43]:
base_models_clas2 = [
    ('KNN', KNeighborsClassifier()),
    ('SVC',SVC()),
    ('Random Forest',RandomForestClassifier()),
    ('Logistic Regression',LogisticRegression()),
    ('XGboost1',XGBClassifier()),
    ('XGboost2',XGBClassifier(max_depth = 7)),
    ('Ridge', RidgeClassifierCV()),
    ('GradientBoosting', GradientBoostingClassifier()),
    ('LGBM1',LGBMClassifier(max_depth = 3)),
    ('LGBM2',LGBMClassifier())

    ]
stacked_clas2 = StackingClassifier(   
    estimators = base_models_clas2,
    final_estimator = LogisticRegression(    max_iter=2000
),
    cv = 10,
    )

In [44]:
stacked_clas2.fit(X_train_bow_df2, y_train)    
stacked_prediction = stacked_clas2.predict(X_test_bow_df2)

In [45]:
print('Test Accuracy: ', accuracy_score(y_test, stacked_prediction))
print('Test f1: ', f1_score(y_test, stacked_prediction)) 

Test Accuracy:  0.814373897707231
Test f1:  0.7685541506322155


In [46]:
from numpy.random import seed
seed(4)
import tensorflow as tf
tf.random.set_seed(3)
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
#sc = StandardScaler()
#para hacer las curvas con los promedios no puede ser standarscaler
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import r2_score



In [47]:
NN_model = Sequential()
# The Input Layer :
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = X_train_bow_df2.shape[1], activation='relu'))
# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='sigmoid'))
# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mse'])
NN_model.summary()

# con early stop:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

history1 = NN_model.fit(X_train_bow_df2, y_train, epochs=1000, batch_size=32, validation_data=(X_test_bow_df2, y_test),callbacks=[es, mc])
_, train_acc = NN_model.evaluate(X_train_bow_df2, y_train, verbose=0)
_, test_acc = NN_model.evaluate(X_test_bow_df2, y_test, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

wights_file = 'best_model.h5' # choose the best checkpoint 
NN_model.load_weights(wights_file) # load it
#NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.compile(optimizer='adam')



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               168064    
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dense_3 (Dense)             (None, 256)               65792     
                                                                 
 dense_4 (Dense)             (None, 1)                 257       
                                                                 
Total params: 332,929
Trainable params: 332,929
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 1: val_loss improved from inf t

In [48]:
predictions_train = NN_model.predict(X_train_bow_df2).round()




In [49]:
f1_score(y_train, predictions_train)

0.8422324232906498

In [50]:
predictions_test = NN_model.predict(X_test_bow_df2).round()
f1_score(y_test, predictions_test)




0.7229336437718276

In [51]:
NN_model = Sequential()
# The Input Layer :
NN_model.add(Dense(64, kernel_initializer='normal',input_dim = X_train_bow_df2.shape[1], activation='relu'))
# The Hidden Layers :
NN_model.add(Dense(128, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(128, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(128, kernel_initializer='normal',activation='relu'))
# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='sigmoid'))
# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mse'])
NN_model.summary()

# con early stop:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 64)                84032     
                                                                 
 dense_6 (Dense)             (None, 128)               8320      
                                                                 
 dense_7 (Dense)             (None, 128)               16512     
                                                                 
 dense_8 (Dense)             (None, 128)               16512     
                                                                 
 dense_9 (Dense)             (None, 1)                 129       
                                                                 
Total params: 125,505
Trainable params: 125,505
Non-trainable params: 0
_________________________________________________________________


In [52]:
history1 = NN_model.fit(X_train_bow_df2, y_train, epochs=100, batch_size=32, validation_data=(X_test_bow_df2, y_test),callbacks=[es, mc])
_, train_acc = NN_model.evaluate(X_train_bow_df2, y_train, verbose=0)
_, test_acc = NN_model.evaluate(X_test_bow_df2, y_test, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

wights_file = 'best_model.h5' # choose the best checkpoint 
NN_model.load_weights(wights_file) # load it
#NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.compile(optimizer='adam')



Epoch 1/100
Epoch 1: val_loss improved from inf to 0.20991, saving model to best_model.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.20991
Epoch 3/100
Epoch 3: val_loss did not improve from 0.20991
Epoch 4/100
Epoch 4: val_loss did not improve from 0.20991
Epoch 5/100
Epoch 5: val_loss did not improve from 0.20991
Epoch 6/100
Epoch 6: val_loss did not improve from 0.20991
Epoch 7/100
Epoch 7: val_loss did not improve from 0.20991
Epoch 8/100
Epoch 8: val_loss did not improve from 0.20991
Epoch 9/100
Epoch 9: val_loss did not improve from 0.20991
Epoch 10/100
Epoch 10: val_loss did not improve from 0.20991
Epoch 11/100
Epoch 11: val_loss did not improve from 0.20991
Epoch 12/100
Epoch 12: val_loss did not improve from 0.20991
Epoch 13/100
Epoch 13: val_loss did not improve from 0.20991
Epoch 14/100
Epoch 14: val_loss did not improve from 0.20991
Epoch 15/100
Epoch 15: val_loss did not improve from 0.20991
Epoch 16/100
Epoch 16: val_loss did not improve from 0.20991
Epoch 17/10

In [53]:
predictions_train = NN_model.predict(X_train_bow_df2).round()
f1_score(y_train, predictions_train)



0.7616161616161616

In [54]:
predictions_test = NN_model.predict(X_test_bow_df2).round()
f1_score(y_test, predictions_test)




0.7178247734138972

In [55]:
NN_model = Sequential()
# The Input Layer :
NN_model.add(Dense(32, kernel_initializer='normal',input_dim = X_train_bow_df2.shape[1], activation='relu'))
# The Hidden Layers :
NN_model.add(Dense(64, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(64, kernel_initializer='normal',activation='relu'))


# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='sigmoid'))
# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mse'])
NN_model.summary()

# con early stop:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)
mc = ModelCheckpoint('best_model2.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 32)                42016     
                                                                 
 dense_11 (Dense)            (None, 64)                2112      
                                                                 
 dense_12 (Dense)            (None, 64)                4160      
                                                                 
 dense_13 (Dense)            (None, 1)                 65        
                                                                 
Total params: 48,353
Trainable params: 48,353
Non-trainable params: 0
_________________________________________________________________


In [56]:
history1 = NN_model.fit(X_train_bow_df2, y_train, epochs=100, batch_size=64, validation_data=(X_test_bow_df2, y_test),callbacks=[es, mc])
_, train_acc = NN_model.evaluate(X_train_bow_df2, y_train, verbose=0)
_, test_acc = NN_model.evaluate(X_test_bow_df2, y_test, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

wights_file = 'best_model2.h5' # choose the best checkpoint 
NN_model.load_weights(wights_file) # load it
#NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.compile(optimizer='adam')



Epoch 1/100
Epoch 1: val_loss improved from inf to 0.31738, saving model to best_model2.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.31738 to 0.20825, saving model to best_model2.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.20825 to 0.20257, saving model to best_model2.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.20257 to 0.19983, saving model to best_model2.h5
Epoch 5/100
Epoch 5: val_loss did not improve from 0.19983
Epoch 6/100
Epoch 6: val_loss did not improve from 0.19983
Epoch 7/100
Epoch 7: val_loss did not improve from 0.19983
Epoch 8/100
Epoch 8: val_loss did not improve from 0.19983
Epoch 9/100
Epoch 9: val_loss did not improve from 0.19983
Epoch 10/100
Epoch 10: val_loss did not improve from 0.19983
Epoch 11/100
Epoch 11: val_loss did not improve from 0.19983
Epoch 12/100
Epoch 12: val_loss did not improve from 0.19983
Epoch 13/100
Epoch 13: val_loss did not improve from 0.19983
Epoch 14/100
Epoch 14: val_loss did not improve from 0.19983
Epoch 15/100
Epoch 15: v

In [57]:
predictions_train = NN_model.predict(X_train_bow_df2).round()
f1_score(y_train, predictions_train)



0.8603880738286795

In [58]:
predictions_test = NN_model.predict(X_test_bow_df2).round()
f1_score(y_test, predictions_test)




0.7527593818984547

In [59]:
base_models_clas = [
    ('KNN', KNeighborsClassifier()),
    ('SVC',SVC()),
    ('Random Forest',RandomForestClassifier()),
    ('Logistic Regression',LogisticRegression()),
    ('XGboost1',XGBClassifier()),
    ('XGboost2',XGBClassifier(max_depth = 7)),
    ('Ridge', RidgeClassifierCV()),
    ('GradientBoosting1', GradientBoostingClassifier()),
    ('GradientBoosting2', GradientBoostingClassifier(max_depth = 7)),
    ('LGBM1',LGBMClassifier(max_depth = 10)),
    ('LGBM2',LGBMClassifier())
    ]
stacked_clas = StackingClassifier(   
    estimators = base_models_clas,
    final_estimator = LogisticRegression(    max_iter=2000
),
    cv = 10,
    )

In [60]:
stacked_clas.fit(X_train_bow_df2, y_train)    
stacked_prediction = stacked_clas.predict(X_test_bow_df2)

In [61]:
print('Test Accuracy: ', accuracy_score(y_test, stacked_prediction))
print('Test f1: ', f1_score(y_test, stacked_prediction)) 

Test Accuracy:  0.8148148148148148
Test f1:  0.7692307692307692


## BERT

In [62]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [63]:
train_texts, val_texts, train_labels, val_labels = train_test_split(df_train2['text'].tolist(), df_train2['target'].tolist(), test_size=0.2, random_state=42)

In [64]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [65]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            pooled_output = outputs.pooler_output
            x = self.dropout(pooled_output)
            logits = self.fc(x)
            return logits

In [66]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [67]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [68]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
    return "positive" if preds.item() == 1 else "negative"

In [69]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 8
num_epochs = 4
learning_rate = 2e-5

In [70]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 9.61MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 571kB/s]


In [71]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

Downloading model.safetensors: 100%|██████████| 440M/440M [00:45<00:00, 9.59MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [72]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_texts) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [73]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
Validation Accuracy: 0.8585
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       867
           1       0.87      0.78      0.82       645

    accuracy                           0.86      1512
   macro avg       0.86      0.85      0.85      1512
weighted avg       0.86      0.86      0.86      1512

Epoch 2/4
Validation Accuracy: 0.8446
              precision    recall  f1-score   support

           0       0.83      0.93      0.87       867
           1       0.88      0.74      0.80       645

    accuracy                           0.84      1512
   macro avg       0.85      0.83      0.84      1512
weighted avg       0.85      0.84      0.84      1512

Epoch 3/4
Validation Accuracy: 0.8446
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       867
           1       0.86      0.76      0.81       645

    accuracy                           0.84      1512
   macro avg  

In [74]:
torch.save(model.state_dict(), "bert_classifier.pth")


In [75]:
pred_test = []

for i in df_test['text'].tolist():
    res = predict_sentiment(i, model, tokenizer, device)
    pred_test.append(res)

In [76]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 157
batch_size = 8
num_epochs = 4
learning_rate = 5e-6

In [77]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [78]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [79]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_texts) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [81]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
Validation Accuracy: 0.8492
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       867
           1       0.85      0.79      0.82       645

    accuracy                           0.85      1512
   macro avg       0.85      0.84      0.84      1512
weighted avg       0.85      0.85      0.85      1512

Epoch 2/4
Validation Accuracy: 0.8519
              precision    recall  f1-score   support

           0       0.84      0.91      0.88       867
           1       0.87      0.77      0.82       645

    accuracy                           0.85      1512
   macro avg       0.86      0.84      0.85      1512
weighted avg       0.85      0.85      0.85      1512

Epoch 3/4
Validation Accuracy: 0.8552
              precision    recall  f1-score   support

           0       0.85      0.91      0.88       867
           1       0.86      0.79      0.82       645

    accuracy                           0.86      1512
   macro avg  

In [82]:
torch.save(model.state_dict(), "bert_classifier.pth")

In [83]:
pred_test2 = []

for i in df_test['text'].tolist():
    res = predict_sentiment(i, model, tokenizer, device)
    pred_test2.append(res)