In [1]:
import pandas as pd

In [2]:
!pip install fasttext



In [3]:
df = pd.read_csv('/content/data-train.csv')
X =  df.drop('Sentiment', axis=1)
y = df['Sentiment']
X = X.values
y = y.values

In [4]:
# count frequency of each label
from collections import Counter
label_counts = Counter(y)
label_counts

Counter({1: 27084, 2: 79064, 3: 32714, 4: 9160, 0: 7026})

In [5]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy={1: label_counts[1] // 3, 2: label_counts[2] // 9, 3: label_counts[3] // 4})
X_resampled, y_resampled = rus.fit_resample(X, y)
label_counts = Counter(y_resampled)
label_counts

Counter({0: 7026, 1: 9028, 2: 8784, 3: 8178, 4: 9160})

In [6]:
# i will only keep the sentence as the train dataset and remove other two columns
X_resampled = X_resampled[:, 2]

In [7]:
# do preprocessing
# make them lowercase
X_resampled = [sentence.lower() for sentence in X_resampled]
# remove punctuation
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
X_resampled = [tokenizer.tokenize(sentence) for sentence in X_resampled]
# remove stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
X_resampled = [[word for word in sentence if word not in stop_words] for sentence in X_resampled]
# do lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
X_resampled = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in X_resampled]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

In [9]:
# Convert the preprocessed sentences into numerical features using TF-IDF vectorization
# this is the approcach 1 mentioned in the doc
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 600)
X_train_tfidf = vectorizer.fit_transform([' '.join(sentence) for sentence in X_train])
X_test_tfidf = vectorizer.transform([' '.join(sentence) for sentence in X_test])
X_validation_tfidf = vectorizer.transform([' '.join(sentence) for sentence in X_val])

In [10]:
# now i want to use fasttext to do feature extraction and get embeddings for a sentence
# this the approach 2 mentioned in the doc
import fasttext
with open('train.txt', 'w') as f:
    for sentence in X_train:
        f.write(' '.join(sentence) + '\n')

params = {
    'lr': 0.1,
    'epoch': 15,
    'dim': 500,
    'ws': 5,
    'minCount': 5,
    'neg': 5
}

# Train the FastText model
model = fasttext.train_unsupervised('train.txt', **params)

# Use the trained FastText model to get sentence embeddings
X_train_fasttext = [model.get_sentence_vector(' '.join(sentence)) for sentence in X_train]
X_test_fasttext = [model.get_sentence_vector(' '.join(sentence)) for sentence in X_test]
X_validation_fasttext = [model.get_sentence_vector(' '.join(sentence)) for sentence in X_val]


In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical

y_train_encoded = to_categorical(y_train)
y_val_encoded = to_categorical(y_val)
y_test_encoded = to_categorical(y_test)
X_train_tfidf = X_train_tfidf.toarray()
X_train_fasttext = np.array(X_train_fasttext)
X_validation_tfidf = X_validation_tfidf.toarray()
X_validation_fasttext = np.array(X_validation_fasttext)
X_test_tfidf = X_test_tfidf.toarray()
X_test_fasttext = np.array(X_test_fasttext)


# Define the neural network model for tfidf
neural_net_tfidf = Sequential()
neural_net_tfidf.add(Dense(256, activation='relu', input_shape=(X_train_tfidf.shape[1],)))
neural_net_tfidf.add(BatchNormalization())
neural_net_tfidf.add(Dropout(0.5))
neural_net_tfidf.add(Dense(128, activation='relu'))
neural_net_tfidf.add(BatchNormalization())
neural_net_tfidf.add(Dropout(0.5))
neural_net_tfidf.add(Dense(64, activation='relu'))
neural_net_tfidf.add(BatchNormalization())
neural_net_tfidf.add(Dropout(0.5))
neural_net_tfidf.add(Dense(5, activation='softmax'))
neural_net_tfidf.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
neural_net_tfidf.fit(X_train_tfidf, y_train_encoded, validation_data=(X_validation_tfidf, y_val_encoded), epochs=40)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f93208ad480>

In [12]:
# Define the neural network model for fasttext
neural_net_fasttext = Sequential()
neural_net_fasttext.add(Dense(256, activation='relu', input_shape=(X_train_fasttext.shape[1],)))
neural_net_fasttext.add(BatchNormalization())
neural_net_fasttext.add(Dropout(0.5))

neural_net_fasttext.add(Dense(128, activation='relu'))
neural_net_fasttext.add(BatchNormalization())
neural_net_fasttext.add(Dropout(0.5))

neural_net_fasttext.add(Dense(64, activation='relu'))
neural_net_fasttext.add(BatchNormalization())
neural_net_fasttext.add(Dropout(0.5))

neural_net_fasttext.add(Dense(32, activation='relu'))
neural_net_fasttext.add(BatchNormalization())
neural_net_fasttext.add(Dropout(0.5))

neural_net_fasttext.add(Dense(16, activation='relu'))
neural_net_fasttext.add(BatchNormalization())
neural_net_fasttext.add(Dropout(0.5))

neural_net_fasttext.add(Dense(5, activation='softmax'))

neural_net_fasttext.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
neural_net_fasttext.fit(X_train_fasttext, y_train_encoded, validation_data=(X_validation_fasttext, y_val_encoded), epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f91bd62e470>

In [13]:
accuracy_tfidf = neural_net_tfidf.evaluate(X_test_tfidf, y_test_encoded)[1]
print(f'accuracy of tfidf: {accuracy_tfidf}')
accuracy_fasttext = neural_net_fasttext.evaluate(X_test_fasttext, y_test_encoded)[1]
print(f'accuracy of fasttext: {accuracy_fasttext}')

accuracy of tfidf: 0.45388808846473694
accuracy of fasttext: 0.5013039112091064


In [14]:
# calculate precision
from sklearn.metrics import precision_score
precision_tfidf = precision_score(y_test_encoded.argmax(axis=1), neural_net_tfidf.predict(X_test_tfidf).argmax(axis=1), average='macro')
precision_fasttext = precision_score(y_test_encoded.argmax(axis=1), neural_net_fasttext.predict(X_test_fasttext).argmax(axis=1), average='macro')
print(f'precision of tfidf: {precision_tfidf}')
print(f'precision of fasttext: {precision_fasttext}')

precision of tfidf: 0.47579289187750906
precision of fasttext: 0.47989077035612926


In [15]:
# calculate recall
from sklearn.metrics import recall_score
recall_tfidf = recall_score(y_test_encoded.argmax(axis=1), neural_net_tfidf.predict(X_test_tfidf).argmax(axis=1), average='macro')
recall_fasttext = recall_score(y_test_encoded.argmax(axis=1), neural_net_fasttext.predict(X_test_fasttext).argmax(axis=1), average='macro')
print(f'recall of tfidf: {recall_tfidf}')
print(f'recall of fasttext: {recall_fasttext}')


recall of tfidf: 0.4508729970051754
recall of fasttext: 0.5018820367573813


In [16]:
# calculate f1-score
from sklearn.metrics import f1_score
f1_tfidf = f1_score(y_test_encoded.argmax(axis=1), neural_net_tfidf.predict(X_test_tfidf).argmax(axis=1), average='macro')
f1_fasttext = f1_score(y_test_encoded.argmax(axis=1), neural_net_fasttext.predict(X_test_fasttext).argmax(axis=1), average='macro')
print(f'f1-score of tfidf: {f1_tfidf}')
print(f'f1-score of fasttext: {f1_fasttext}')

f1-score of tfidf: 0.44403380655326086
f1-score of fasttext: 0.47823633986779424


In [17]:
# normalized confusion matrix
from sklearn.metrics import confusion_matrix
confusion_tfidf = confusion_matrix(y_test_encoded.argmax(axis=1), neural_net_tfidf.predict(X_test_tfidf).argmax(axis=1))
confusion_fasttext = confusion_matrix(y_test_encoded.argmax(axis=1), neural_net_fasttext.predict(X_test_fasttext).argmax(axis=1))
confusion_tfidf_normalized = confusion_tfidf / confusion_tfidf.sum(axis=1)[:, np.newaxis]
confusion_fasttext_normalized = confusion_fasttext / confusion_fasttext.sum(axis=1)[:, np.newaxis]
print(f'normalized confusion matrix of tfidf:\n {confusion_tfidf_normalized}')
print(f'\nnormalized confusion matrix of fasttext:\n {confusion_fasttext_normalized}')

normalized confusion matrix of tfidf:
 [[0.43784153 0.2260929  0.26775956 0.03551913 0.03278689]
 [0.1615938  0.31045932 0.42280022 0.0647482  0.04039845]
 [0.03195816 0.131319   0.71179547 0.08192911 0.04299826]
 [0.02216749 0.0862069  0.41133005 0.22229064 0.25800493]
 [0.01373626 0.03956044 0.18516484 0.18956044 0.57197802]]

normalized confusion matrix of fasttext:
 [[0.69535519 0.20969945 0.03620219 0.01912568 0.03961749]
 [0.39679026 0.33204206 0.15495296 0.05810736 0.05810736]
 [0.08367228 0.21499128 0.49099361 0.12144102 0.0889018 ]
 [0.05972906 0.13485222 0.17426108 0.19211823 0.43903941]
 [0.01868132 0.05274725 0.02637363 0.1032967  0.7989011 ]]


Cells below this cell are related to kaggle competition

In [18]:
test_df = pd.read_csv('/content/pr-test-data.csv')
X_test_df = test_df['Phrase'].values
ID_column = test_df['ID'].values

In [19]:
# do preprocessing
# make them lowercase
X_test_df = [sentence.lower() for sentence in X_test_df]
# remove punctuation
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
X_test_df = [tokenizer.tokenize(sentence) for sentence in X_test_df]
# remove stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
X_test_df = [[word for word in sentence if word not in stop_words] for sentence in X_test_df]
# do lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
X_test_df = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in X_test_df]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
X_test_df_tfidf = vectorizer.transform([' '.join(sentence) for sentence in X_test_df])
X_test_df_tfidf = X_test_df_tfidf.toarray()
y_pred_tfidf = neural_net_tfidf.predict(X_test_df_tfidf).argmax(axis=1)



In [21]:
submission_df_tfidf = pd.DataFrame({'Sentiment': y_pred_tfidf, 'ID':ID_column})
submission_df_tfidf.to_csv('submission_tfidf.csv', index=False)

In [22]:
X_test_df_fasttext = [model.get_sentence_vector(' '.join(sentence)) for sentence in X_test_df]
X_test_df_fasttext = np.array(X_test_df_fasttext)
y_pred_fasttext = neural_net_fasttext.predict(X_test_df_fasttext).argmax(axis=1)



In [23]:
submission_df_fasttext = pd.DataFrame({'Sentiment': y_pred_fasttext, 'ID':ID_column})
submission_df_fasttext.to_csv('submission_fasttext.csv', index=False)