In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

In [3]:
train = pd.read_csv('/content/drive/MyDrive/2020 WINTER PROJ/input/train.csv')
test = pd.read_csv('/content/drive/MyDrive/2020 WINTER PROJ/input/test.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [4]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
train.dropna(inplace=True)

In [6]:
# Remove weird spaces
def remove_space(text):
    text = text.strip()
    text = text.split()
    return " ".join(text)
train['text'] = train['text'].apply(lambda x:remove_space(str(x)))

In [7]:
# Convert text into lowercase
train['text'] = train['text'].apply(lambda x:str(x).lower())

In [8]:
# Download WordNet data
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [9]:
import re
from nltk.corpus import wordnet

# Correct the words with repeated characters
# ref: https://www.youtube.com/watch?v=r37OYsdH6Z8
class RepeatReplacer:
  def __init__(self):
    self.regex = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'

  def replace(self, word):
    if wordnet.synsets(word):
      return word
    loop_res = self.regex.sub(self.repl, word)
    if (word == loop_res):
      return loop_res
    else:
      return self.replace(loop_res)

In [10]:
# Testing RepeatReplacer
replacer = RepeatReplacer()
replace_result = replacer.replace('Sweeeeeeeet')

print(replace_result)

Sweet


In [11]:
replacer = RepeatReplacer()
train.text = train.text.apply(lambda x:' '.join([replacer.replace(word) for word in str(x).split()]))

In [12]:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
train.text = train.text.apply(lambda x:' '.join([lmtzr.lemmatize(word, 'v') for word in str(x).split()]))

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
from nltk.corpus import stopwords
def remove_stopword(x):
    return [y for y in x if y not in stopwords.words('english')]
train.text = train.text.apply(lambda x:' '.join(remove_stopword(str(x).split())))

In [16]:
contraction = {
    "'cause": 'because',
    ',cause': 'because',
    ';cause': 'because',
    "ain't": 'am not',
    'ain,t': 'am not',
    'ain;t': 'am not',
    'ain`t': 'am not',
    'aint': 'am not',
    'arent': 'are not',
    "aren't": 'are not',
    'are;t': 'are not',
    
}

Prediction by TextCategorizer in spaCy

In [17]:
test.head(20)

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
5,726e501993,that`s great!! weee!! visitors!,positive
6,261932614e,I THINK EVERYONE HATES ME ON HERE lol,negative
7,afa11da83f,"soooooo wish i could, but im in school and my...",negative
8,e64208b4ef,and within a short time of the last clue all ...,neutral
9,37bcad24ca,What did you get? My day is alright.. haven`...,neutral


In [18]:
# Pre-processing

test['text'] = test['text'].apply(lambda x:remove_space(str(x)))
test['text'] = test['text'].apply(lambda x:str(x).lower())
test.text = test.text.apply(lambda x:' '.join([replacer.replace(word) for word in str(x).split()]))
test.text = test.text.apply(lambda x:' '.join([lmtzr.lemmatize(word, 'v') for word in str(x).split()]))
test.text = test.text.apply(lambda x:' '.join(remove_stopword(str(x).split())))

In [19]:
import spacy

nlp = spacy.blank('en')
print("Created blank 'en' model")

Created blank 'en' model


In [20]:
textcat = nlp.create_pipe(
                'textcat',
                config={
                    'exclusive_classes': True,
                    'architecture': 'bow'
                })
nlp.add_pipe(textcat)

In [21]:
textcat.add_label('positive')
textcat.add_label('neutral')
textcat.add_label('negative')

1

In [22]:
train_texts = train.text.values
train_labels = [{'cats': {'positive': label == 'positive',
                        'negative': label == 'negative'}}
               for label in train.sentiment]

In [23]:
train_data = list(zip(train_texts, train_labels))

In [24]:
# Training (epoch = 10)
from spacy.util import minibatch
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    batches = minibatch(train_data, size=8)
    for batch in batches:
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 16.291474719182588}
{'textcat': 28.46322232997045}
{'textcat': 39.11107330862433}
{'textcat': 48.78966702395701}
{'textcat': 57.76399508112809}
{'textcat': 66.19758988160174}
{'textcat': 74.1716257025837}
{'textcat': 81.7747914660431}
{'textcat': 89.04830890125595}
{'textcat': 96.03549531319004}


In [25]:
# Prediction
docs = [nlp.tokenizer(text) for text in test.text]

textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

# 'scores' indicates the probability the input text belongs to the classes
print(scores)

[[2.6037344e-01 5.8854884e-01 1.5107772e-01]
 [9.9914968e-01 7.5942063e-04 9.0901711e-05]
 [6.2726613e-04 3.4488219e-01 6.5449053e-01]
 ...
 [1.6433069e-02 2.5297382e-01 7.3059309e-01]
 [2.7196190e-01 7.2349709e-01 4.5410120e-03]
 [9.2816174e-01 7.0460327e-02 1.3778904e-03]]


In [26]:
predicted_labels = scores.argmax(axis=1)
prediction = pd.Series([textcat.labels[label] for label in predicted_labels])
print(prediction)

0        neutral
1       positive
2       negative
3       positive
4       positive
          ...   
3529    negative
3530     neutral
3531    negative
3532     neutral
3533    positive
Length: 3534, dtype: object


In [27]:
correct = prediction == test.sentiment
score = correct.sum()/correct.size
print(score)

0.6994906621392191
