# Text Classification
## Using vector semantics, we can easily convert a given text into its corresponding vector form. Given any text, first pre- process the text and convert it into a vector using BoW methods. Given this vector, implement your own classifier to classify the vector is predefined categories

In [51]:
# python libraries for data manipulation and math
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [22]:
train_data= pd.read_csv('dataset/Corona_NLP_train.csv', encoding="latin1")
train_data

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41154,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [14]:
test_data= pd.read_csv('dataset/Corona_NLP_test.csv', encoding="latin1")
test_data.shape

(3798, 6)

## Dripping the unwanted features so that we can focus on relavent features

In [23]:
unrelevant_features = ["UserName", "ScreenName", "Location", "TweetAt"]

train_data.drop(unrelevant_features,inplace=True,axis=1)
test_data.drop(unrelevant_features,inplace=True,axis=1)
train_data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [25]:
train_data["Sentiment"].value_counts()


Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

## we have to split our data classes according to the sentiment.

In [33]:
positives_train = train_data[(train_data["Sentiment"] == "Positive") | (train_data["Sentiment"] == "Extremely Positive")]
positives_test = test_data[(test_data["Sentiment"] == "Positive") | (test_data["Sentiment"] == "Extremely Positive")]
print(positives_train["Sentiment"].value_counts())


negatives_train = train_data[(train_data["Sentiment"] == "Negative") | (train_data["Sentiment"] == "Extremely Negative")]
negatives_test = test_data[(test_data["Sentiment"] == "Negative") | (test_data["Sentiment"] == "Extremely Negative")]
print(negatives_train["Sentiment"].value_counts())
positives_train,negatives_train.head()

Positive              11422
Extremely Positive     6624
Name: Sentiment, dtype: int64
Negative              9917
Extremely Negative    5481
Name: Sentiment, dtype: int64


(                                           OriginalTweet           Sentiment
 1      advice Talk to your neighbours family to excha...            Positive
 2      Coronavirus Australia: Woolworths to give elde...            Positive
 3      My food stock is not the only one which is emp...            Positive
 5      As news of the regionÂs first confirmed COVID...            Positive
 6      Cashier at grocery store was sharing his insig...            Positive
 ...                                                  ...                 ...
 41146  #Gold prices rose to a more than 7-year high t...            Positive
 41148  UV light Sterilizer Sanitizer for your mask an...  Extremely Positive
 41150  I never that weÂd be in a situation &amp; wor...            Positive
 41151  @MrSilverScott you are definitely my man. I fe...  Extremely Positive
 41154  You know itÂs getting tough when @KameronWild...            Positive
 
 [18046 rows x 2 columns],
                                   

In [35]:
neutrals_train = train_data[train_data["Sentiment"] == "Neutral"]
neutrals_test = test_data[test_data["Sentiment"] == "Neutral"]
print(neutrals_train["Sentiment"].value_counts())
neutrals_train.head()

Neutral    7713
Name: Sentiment, dtype: int64


Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
7,Was at the supermarket today. Didn't buy toile...,Neutral
10,All month there hasn't been crowding in the su...,Neutral
16,????? ????? ????? ????? ??\r\r\n?????? ????? ?...,Neutral
17,@eyeonthearctic 16MAR20 Russia consumer survei...,Neutral


## Now we have our lables we should convert them into integers.

In [39]:
import warnings as wrn
wrn.filterwarnings('ignore')

negatives_train["Sentiment"] = 0 
negatives_test["Sentiment"] = 0

positives_train["Sentiment"] = 2
positives_test["Sentiment"] = 2

neutrals_train["Sentiment"] = 1
neutrals_test["Sentiment"] = 1

## We should clean our data by concatanating train and test data and splitting it later after preprocess.

In [44]:
data = pd.concat([positives_train, positives_test, neutrals_train, neutrals_test, negatives_train, negatives_test ],axis=0)
data.reset_index(inplace=True)
data.head()

Unnamed: 0,index,OriginalTweet,Sentiment
0,1,advice Talk to your neighbours family to excha...,2
1,2,Coronavirus Australia: Woolworths to give elde...,2
2,3,My food stock is not the only one which is emp...,2
3,5,As news of the regionÂs first confirmed COVID...,2
4,6,Cashier at grocery store was sharing his insig...,2


In [46]:
clean_Data = []

lemma = WordNetLemmatizer()
s_words = stopwords.words("english")
for text in data["OriginalTweet"]:
    
    text = re.sub(r'http\S+', '', text) # Cleaning all the links
    
    text = re.sub("[^a-zA-Z0-9]"," ",text) # Cleaning everything except alphabets and numericals
    
    text = nltk.word_tokenize(text.lower())  # Tokenizing
    
    text = [lemma.lemmatize(word) for word in text]  # lemmatizing
    
    text = [word for word in text if word not in s_words] # Removing stopwords
    
    text = " ".join(text)
    
    clean_Data.append(text)

## Clean data is vectorized using Bag Of Words(BOW)

In [50]:
vectorizer = CountVectorizer()
BOW = vectorizer.fit_transform(clean_Data)
BOW.shape

(44955, 53569)

## Splitting the data

In [62]:
x_train,x_test,y_train,y_test = train_test_split(BOW,np.asarray(data["Sentiment"]),test_size=0.3)

## We are going to do the classification using Support vector clasifier (SVC)

In [65]:
from sklearn.svm import SVC
import time

start_time = time.time()
model = SVC()
model.fit(x_train,y_train)

end_time = time.time()
process_time = round(end_time-start_time,2)
print("Fitting SVC took {} seconds".format(process_time))

Fitting SVC took 565.78 seconds


In [66]:
predictions = model.predict(x_test)

In [67]:
from sklearn.metrics import accuracy_score,confusion_matrix

print("Accuracy of model is {}%".format(accuracy_score(y_test,predictions) * 100))

Accuracy of model is 76.65900496774672%
