<a href="https://colab.research.google.com/github/krizchellewong/Sentiment-Analysis/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that the CSV files referenced by the
# later cells (under "/content/drive/My Drive/...") can be read.
# This cell relies on the google.colab package.
# NOTE(review): data_table is imported here but is not used in the visible cells.
from google.colab import drive, data_table
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import csv


# Location of the labelled training split on the mounted Drive.
train_path = "/content/drive/My Drive/NLP1000/train.csv"

# The first CSV column is used as the index (it shows up as DocumentId in the
# printed frame), leaving 'Text' and 'IsPositive' as the data columns.
train_df = pd.read_csv(
    train_path,
    index_col=0,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
)

# Report the corpus size and the class balance.
print("Number of training Documents: %d" % len(train_df))
print("Positive: %d" % train_df['IsPositive'].eq(True).sum())
print("Negative: %d" % train_df['IsPositive'].eq(False).sum())

# Peek at the first 20 rows to decide what needs pre-processing
print(train_df.head(20))

Number of training Documents: 105846
Positive: 52923
Negative: 52923
                                                         Text  IsPositive
DocumentId                                                               
2388179                                    never again 🌵 LINK        True
657251      ma okey rang tanan promise naas lord dita nya ...        True
1730789                            MENTION yawaaa gyud oh ?🖕😂        True
868789      bahalag mangalata ang all, dili jud ko mag gma...       False
1570427          makatambok gyud diay ang wholeday na tulog 😔       False
921483      grabe tumawad mga naorder hahaha sis yung tubo...        True
2234531                              good morning mga sawii 💦        True
199387      stormy tuesday! braving pepito! keep safe and ...        True
463795      aaahhh success yung pagconvince ko kay mama at...       False
370398      a smart villain per se, but acting stupid and ...       False
359918      time check 2:37 am medyo nakaka

In [None]:
# Location of the unlabelled test split on the mounted Drive.
test_path = "/content/drive/My Drive/NLP1000/test.csv"

# Unlike the training frame, no index column is set here, so DocumentId stays
# an ordinary column that can be read back below.
test_df = pd.read_csv(
    test_path,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
)

# Keep the document ids as an array; they are paired with the predictions
# when the submission file is built.
test_id = test_df['DocumentId'].values


In [None]:
# Pre-processing and feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Turning the data columns into arrays.
# pandas parses empty CSV fields as NaN (a float), which would make the regex
# substitution below raise TypeError, so missing texts become empty strings.
# When no text is missing this is a no-op.
x_train = train_df['Text'].fillna('').values
x_test = test_df['Text'].fillna('').values
y_train = train_df['IsPositive'].values

# Pre-processing steps considered: removal of stop words, lowercasing of text,
# and removal of certain punctuation marks that do not necessarily convey emotion.
# Of these, lowercasing is done automatically by scikit-learn's vectorizers and
# punctuation removal is done by the regex below; stop words are NOT removed,
# because TfidfVectorizer() is created with its default stop_words=None.
#
# The hyphen is escaped so it is an explicit literal instead of forming the
# accidental range ",-." inside the character class (the set of characters
# matched is unchanged: , - .). Characters such as ! ? : are deliberately not
# listed and are therefore kept.
_PUNCTUATION_RE = re.compile(r'["$%&\'\[\]()*+,\-./;<=>@^_`{}#]')


def _strip_punctuation(texts):
    """Return a list of the given texts with the listed punctuation removed.

    Args:
        texts: iterable of strings (one document per element).

    Returns:
        A new list of strings, in the same order, with every character matched
        by ``_PUNCTUATION_RE`` deleted.
    """
    return [_PUNCTUATION_RE.sub('', text) for text in texts]


# Apply the same cleaning to both splits through one code path.
x_train = _strip_punctuation(x_train)
x_test = _strip_punctuation(x_test)

vectorizer = TfidfVectorizer()

# Fit the vocabulary/IDF weights on the training texts only, then reuse them
# to project the test texts into the same feature space.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Train a logistic-regression classifier on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Predict a label for every test document.
prediction = clf.predict(x_test)

# Pair each test DocumentId with its predicted label.
df = pd.DataFrame(
    {
        'DocumentId': test_id,
        'IsPositive': prediction,
    }
)

# Write the submission file with every field quoted and without the row index.
df.to_csv(
    'submission.csv',
    index=False,
    quoting=csv.QUOTE_ALL,
)