In [1]:
# Exploratory Data Analysis
import pandas as pd

In [2]:
# Read the training and test CSV files (paths are relative to the notebook's
# working directory) into two DataFrames.
df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Report how many rows each frame holds: training frame first, test frame second.
# NOTE(review): the stored output shows 1,048,575 training rows, which matches a
# spreadsheet's per-sheet row limit minus one header row -- confirm train.csv
# was not truncated by a spreadsheet export.
for frame in (df, test_df):
    print('Number of rows:', len(frame))

Number of rows: 1048575
Number of rows: 359


In [4]:
# Count how many rows carry each Sentiment label: training frame first,
# test frame second.
for frame in (df, test_df):
    label_counts = frame['Sentiment'].value_counts()
    print(label_counts)

0    800000
1    248575
Name: Sentiment, dtype: int64
1    182
0    177
Name: Sentiment, dtype: int64


In [5]:
# Count missing entries per column: training frame first, test frame second.
for frame in (df, test_df):
    missing_per_column = frame.isna().sum()
    print(missing_per_column)

Index        0
Sentiment    0
Text         0
dtype: int64
Index        0
Sentiment    0
Text         0
dtype: int64


In [6]:
# Text Preprocessing
import re
import nltk
import ssl

# Make sure NLTK's stop-word corpus is available.
#
# The network is only touched when the corpus is not installed yet, so a
# re-run of the notebook works offline once the corpus has been fetched.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # Some environments fail the NLTK download on certificate verification.
    # The workaround swaps in an unverified HTTPS context, but only for the
    # duration of this one download: the verified default is restored
    # afterwards so later HTTPS traffic in this kernel is still checked.
    _verified_https_context = getattr(ssl, '_create_default_https_context', None)
    _unverified_https_context = getattr(ssl, '_create_unverified_context', None)
    _can_swap_context = (
        _verified_https_context is not None
        and _unverified_https_context is not None
    )
    if _can_swap_context:
        ssl._create_default_https_context = _unverified_https_context
    try:
        # Download stopwords
        nltk.download('stopwords')
    finally:
        if _can_swap_context:
            ssl._create_default_https_context = _verified_https_context

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/julia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Normalise one document and drop its stop words.

    Steps, in order: lowercase, delete digit runs, delete every character
    that is neither a word character nor whitespace, collapse whitespace,
    then remove stop words.

    Punctuation is stripped before the stop-word lookup, so a contraction
    such as "don't" is looked up as "dont".

    Args:
        text: The raw document as a string.
        stop_words: Optional collection of words to remove. When omitted,
            NLTK's English stop-word list is used; it is loaded once and
            reused across calls instead of being rebuilt for every document.

    Returns:
        The remaining words joined by single spaces ('' if nothing is left).
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Convert to lowercase
    text = text.lower()

    # Remove numbers and special characters
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Keep only the words that are not stop words
    kept_words = [word for word in text.split() if word.casefold() not in stop_words]

    return ' '.join(kept_words)

# Run clean_text over the raw 'Text' column of both frames and store the
# result in a new lowercase 'text' column; the original 'Text' column is kept.
for frame in (df, test_df):
    frame['text'] = frame['Text'].apply(clean_text)

In [None]:
# Linguistic Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

In [None]:
# Bag-of-words: fit the vocabulary on the cleaned training text and build the
# document-term count matrix in the same call.
cleaned_corpus = df['text']
count_vect = CountVectorizer()
bow = count_vect.fit_transform(cleaned_corpus)

In [None]:
# TF-IDF: fit the vectorizer on the cleaned training text and build the
# weighted document-term matrix in the same call.
cleaned_corpus = df['text']
tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(cleaned_corpus)

In [None]:
# Word2Vec
import numpy as np

# One token list per cleaned document (the cleaned text is space-separated).
sentences = [text.split() for text in df['text']]
word2vec_model = Word2Vec(sentences, window=5, min_count=1)

# Take the embedding width from the trained model instead of hard-coding it,
# so the feature matrix stays correct if the model's vector size changes.
embedding_dim = word2vec_model.wv.vector_size

# Each document is represented by the SUM (not the mean) of its word vectors;
# a document with no known words stays all-zero. The matrix is preallocated
# so the rows are not first collected in a Python list and then copied.
word2vec_X = np.zeros((len(sentences), embedding_dim))
for row, sentence in enumerate(sentences):
    for word in sentence:
        if word in word2vec_model.wv:
            word2vec_X[row] += word2vec_model.wv[word]

In [None]:
# Sentiment Classification Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Show the first four rows of the training frame, including the new 'text' column.
preview_rows = df.head(n=4)
print(preview_rows)

In [None]:
# Split each feature representation into training and testing sets.
# Every split uses the same labels, the same 20% test fraction and the same seed.
# NOTE(review): test_df is loaded and cleaned above, but no cell visible here
# evaluates on it -- confirm whether it should serve as the final hold-out set.
sentiment_labels = df['Sentiment']
split_options = {'test_size': 0.2, 'random_state': 42}

X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow, sentiment_labels, **split_options
)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    tfidf, sentiment_labels, **split_options
)
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(
    word2vec_X, sentiment_labels, **split_options
)

In [None]:
# Logistic Regression: one model per text representation.
# max_iter is set very high so the solver's iteration cap is effectively lifted.
lr_bow = LogisticRegression(max_iter=1000000).fit(X_train_bow, y_train_bow)
lr_tfidf = LogisticRegression(max_iter=1000000).fit(X_train_tfidf, y_train_tfidf)

# Echo the last fitted estimator as the cell output.
lr_tfidf

In [None]:
# SVM classifier
# Kernel SVC fit time grows at least quadratically with the number of samples,
# which is impractical for a training split of this size (80% of ~1M rows).
# LinearSVC trains a linear SVM that scales to large sample counts and exposes
# the same fit/predict interface used by the evaluation cells.
from sklearn.svm import LinearSVC

svm_bow = LinearSVC(random_state=42)
svm_bow.fit(X_train_bow, y_train_bow)

svm_tfidf = LinearSVC(random_state=42)
svm_tfidf.fit(X_train_tfidf, y_train_tfidf)

In [None]:
# Naive Bayes classifier: one multinomial model per text representation.
nbc_bow = MultinomialNB().fit(X_train_bow, y_train_bow)
nbc_tfidf = MultinomialNB().fit(X_train_tfidf, y_train_tfidf)

# Echo the last fitted estimator as the cell output.
nbc_tfidf

In [None]:
# Random Forest classifier
# random_state pins the forest's randomness so a re-run reproduces the same
# model (the train/test split above is seeded with the same value);
# n_jobs=-1 builds the trees on all available cores.
rfc_bow = RandomForestClassifier(random_state=42, n_jobs=-1)
rfc_bow.fit(X_train_bow, y_train_bow)

rfc_tfidf = RandomForestClassifier(random_state=42, n_jobs=-1)
rfc_tfidf.fit(X_train_tfidf, y_train_tfidf)

In [None]:
# MLP model
from keras.models import Sequential
from keras.layers import Dense, Dropout

# create MLP model: one hidden layer over the Word2Vec sentence vectors and a
# single sigmoid unit for the binary sentiment label.
# The input width is read from X_train_word2vec, the name produced by the
# train/test split cell (the previous `X_train_w2v` was never defined).
mlp_model = Sequential()
mlp_model.add(Dense(128, activation='relu', input_dim=X_train_word2vec.shape[1]))
mlp_model.add(Dropout(0.2))
mlp_model.add(Dense(1, activation='sigmoid'))

# compile model
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit model
# The labels come from a shuffled pandas Series; hand Keras a plain NumPy array.
mlp_model.fit(X_train_word2vec, np.asarray(y_train_word2vec), epochs=10, batch_size=32, validation_split=0.1)

# LSTM model
from keras.layers import LSTM, Embedding, Input, Reshape

# create LSTM model
# The model is trained on X_train_word2vec: one dense float vector per
# document. An Embedding layer cannot consume that (it looks up integer token
# ids, and the `vocab_size` / `max_length` it referenced were never defined),
# so each vector is reshaped into a sequence of length one and fed to the LSTM.
# NOTE(review): with a single timestep the LSTM has no word order to model.
# A real sequence model needs padded token-id sequences through an Embedding
# layer, with their own train/test split -- confirm which design is intended.
w2v_dim = X_train_word2vec.shape[1]
lstm_model = Sequential()
lstm_model.add(Input(shape=(w2v_dim,)))
lstm_model.add(Reshape((1, w2v_dim)))
lstm_model.add(LSTM(units=64))
lstm_model.add(Dense(1, activation='sigmoid'))

# compile model
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit model
# The labels come from a shuffled pandas Series; hand Keras a plain NumPy array.
lstm_model.fit(X_train_word2vec, np.asarray(y_train_word2vec), epochs=10, batch_size=32, validation_split=0.1)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict the held-out bag-of-words split with the logistic regression model
y_pred_lr_bow = lr_bow.predict(X_test_bow)

# Print accuracy, then weighted-average precision, recall and F1
print('Logistic Regression Bag of Words')
print('Accuracy:', accuracy_score(y_test_bow, y_pred_lr_bow))
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_fn(y_test_bow, y_pred_lr_bow, average='weighted'))

In [None]:
# Predict the held-out TF-IDF split with the logistic regression model
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)

# Print accuracy, then weighted-average precision, recall and F1
print('Logistic Regression TF-IDF')
print('Accuracy:', accuracy_score(y_test_tfidf, y_pred_lr_tfidf))
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_fn(y_test_tfidf, y_pred_lr_tfidf, average='weighted'))

In [None]:
# Predict the held-out bag-of-words split with the SVM model
y_pred_svm_bow = svm_bow.predict(X_test_bow)

# Print accuracy, then weighted-average precision, recall and F1
print('SVM classifier Bag of Words')
print('Accuracy:', accuracy_score(y_test_bow, y_pred_svm_bow))
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_fn(y_test_bow, y_pred_svm_bow, average='weighted'))

In [None]:
# Predict the held-out TF-IDF split with the SVM model
y_pred_svm_tfidf = svm_tfidf.predict(X_test_tfidf)

# Print accuracy, then weighted-average precision, recall and F1
print('SVM classifier TF-IDF')
print('Accuracy:', accuracy_score(y_test_tfidf, y_pred_svm_tfidf))
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_fn(y_test_tfidf, y_pred_svm_tfidf, average='weighted'))

In [None]:
# Predict the held-out bag-of-words split with the naive Bayes model
y_pred_nbc_bow = nbc_bow.predict(X_test_bow)

# Print accuracy, then weighted-average precision, recall and F1
print('Naive Bayes Bag of Words')
print('Accuracy:', accuracy_score(y_test_bow, y_pred_nbc_bow))
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_fn(y_test_bow, y_pred_nbc_bow, average='weighted'))

In [None]:
# Predict the held-out TF-IDF split with the naive Bayes model
y_pred_nbc_tfidf = nbc_tfidf.predict(X_test_tfidf)

# Print accuracy, then weighted-average precision, recall and F1
print('Naive Bayes TF-IDF')
print('Accuracy:', accuracy_score(y_test_tfidf, y_pred_nbc_tfidf))
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_fn(y_test_tfidf, y_pred_nbc_tfidf, average='weighted'))

In [None]:
# Predict the held-out bag-of-words split with the random forest model
y_pred_rfc_bow = rfc_bow.predict(X_test_bow)

# Print accuracy, then weighted-average precision, recall and F1
print('Random Forest Bag of Words')
print('Accuracy:', accuracy_score(y_test_bow, y_pred_rfc_bow))
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_fn(y_test_bow, y_pred_rfc_bow, average='weighted'))

In [None]:
# Predict the held-out TF-IDF split with the random forest model
y_pred_rfc_tfidf = rfc_tfidf.predict(X_test_tfidf)

# Print accuracy, then weighted-average precision, recall and F1
print('Random Forest TF-IDF')
print('Accuracy:', accuracy_score(y_test_tfidf, y_pred_rfc_tfidf))
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_fn(y_test_tfidf, y_pred_rfc_tfidf, average='weighted'))

In [None]:
# Make predictions on the testing set
# Both networks end in a single sigmoid unit, so predict() returns one
# probability per row in a column of shape (n, 1). The scikit-learn metrics
# below need hard 0/1 labels in a flat array, so the probabilities are
# thresholded at 0.5 and flattened. This also replaces `predict_classes`,
# which current Keras releases no longer provide.
y_pred_mlp = (mlp_model.predict(X_test_word2vec) > 0.5).astype(int).ravel()
y_pred_lstm = (lstm_model.predict(X_test_word2vec) > 0.5).astype(int).ravel()

# Calculate evaluation metrics
print('MLP - multilayer perceptron - Word2Vec')
print('Accuracy:', accuracy_score(y_test_word2vec, y_pred_mlp))
print('Precision:', precision_score(y_test_word2vec, y_pred_mlp, average='weighted'))
print('Recall:', recall_score(y_test_word2vec, y_pred_mlp, average='weighted'))
print('F1-score:', f1_score(y_test_word2vec, y_pred_mlp, average='weighted'))

print('LSTM - long short-term memory - Word2Vec')
print('Accuracy:', accuracy_score(y_test_word2vec, y_pred_lstm))
print('Precision:', precision_score(y_test_word2vec, y_pred_lstm, average='weighted'))
print('Recall:', recall_score(y_test_word2vec, y_pred_lstm, average='weighted'))
print('F1-score:', f1_score(y_test_word2vec, y_pred_lstm, average='weighted'))