<a href="https://colab.research.google.com/github/harita-gr/AI_ML_Practice/blob/main/ICT_AI_ML_LS_25_BoW_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
corpus = ['Data Science is an overlap between Arts and Science','Generally, Arts graduates are right-brained and Science graduates are left-brained', 'Excelling in both Arts and Science at a time becomes difficult','Natural Language Processing is a part of Data Science']

In [13]:
def vectorise_text(corpus):
  bow_model = CountVectorizer();
  dense_vec = bow_model.fit_transform(corpus).todense() #convert array to vector (USE BOW_MODEL or TF_IDF)
  bow_df = pd.DataFrame(dense_vec) #vector is created in alphabetical order of words
  bow_df.columns = sorted(bow_model.vocabulary_)
  return(bow_df)

In [12]:
df = vectorise_text(corpus)
df

Unnamed: 0,data,science,is,an,overlap,between,arts,and,generally,graduates,...,both,at,time,becomes,difficult,natural,language,processing,part,of
0,1,1,0,1,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,2,0
1,0,1,2,1,0,0,0,0,2,0,...,0,1,0,0,0,0,0,1,1,0
2,0,1,0,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,0,0,0,1,...,1,0,1,1,0,1,1,0,1,0


In [14]:
!unzip '/content/emotions.zip'

Archive:  /content/emotions.zip
  inflating: test.txt                
  inflating: train.txt               
  inflating: val.txt                 


All the files are ; separated

In [15]:
df_train = pd.read_csv("train.txt", delimiter=';',names = ['text','label'] )
df_val = pd.read_csv("val.txt", delimiter=';',names = ['text','label'] )


In [17]:
df_train.shape

(16000, 2)

In [18]:
df_val.shape

(2000, 2)

In [16]:
pd.set_option('display.max_colwidth',None)
df_train.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,love
4,i am feeling grouchy,anger


Combining them into a single data frame

In [19]:
data = pd.concat([df_train,df_val])

In [20]:
data.shape

(18000, 2)

In [21]:
data['label'].value_counts()

label
joy         6066
sadness     5216
anger       2434
fear        2149
love        1482
surprise     653
Name: count, dtype: int64

Binary Classification
- Positive Emotion (1) - love, surprise,joy
- Negative Emotion (0) - sadness, anger, fear

In [23]:
def custom_encoder(df):
  df.replace(to_replace=['love','surprise','joy'],value=1, inplace=True)
  df.replace(to_replace=['sadness','anger','fear'],value=0, inplace=True)
  return df

In [24]:
data['label'] = custom_encoder(data['label'])

In [25]:
data.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,0
2,im grabbing a minute to post i feel greedy wrong,0
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,1
4,i am feeling grouchy,0


In [26]:
data.label.value_counts(normalize=True)

label
0    0.544389
1    0.455611
Name: proportion, dtype: float64

## Data Processing

Removing punctuation

In [27]:
import string
def remove_punctuations(text):
  punc_free = ''.join([i for i in text if i not in string.punctuation])
  return punc_free

Word Tokenization


In [28]:
import nltk
nltk.download('punkt')

def tokenize(text):
  words = nltk.word_tokenize(text)
  return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Remove stop words

In [29]:
from nltk.corpus import stopwords
nltk.download('stopwords')

sw = stopwords.words("english")

def sw_removed(text):
  sw_text = [i for i in text if i not in sw]
  return sw_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Lemmatization

In [36]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

word_lem = WordNetLemmatizer()

def lemm(text):
    lemm_text = [word_lem.lemmatize(word) for word in text]
    return lemm_text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [40]:
def preprocess(df_col):
  corpus = []
  for item in df_col:
    new_item = remove_punctuations(item)
    new_item = new_item.lower() #lowercasing
    new_item = tokenize(new_item)
    new_item = sw_removed(new_item)
    new_ietm = lemm(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [41]:
corpus = preprocess(data['text'])

In [42]:
corpus[0:10]

['didnt feel humiliated',
 'go feeling hopeless damned hopeful around someone cares awake',
 'im grabbing minute post feel greedy wrong',
 'ever feeling nostalgic fireplace know still property',
 'feeling grouchy',
 'ive feeling little burdened lately wasnt sure',
 'ive taking milligrams times recommended amount ive fallen asleep lot faster also feel like funny',
 'feel confused life teenager jaded year old man',
 'petronas years feel petronas performed well made huge profit',
 'feel romantic']

## Bag of Words

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
vec_data = cv.fit_transform(corpus)
x = vec_data
y = data['label']

In [45]:
x # Sparse Vector form

<18000x121731 sparse matrix of type '<class 'numpy.int64'>'
	with 311883 stored elements in Compressed Sparse Row format>

In [46]:
y

0       0
1       0
2       0
3       1
4       0
       ..
1995    0
1996    1
1997    1
1998    1
1999    1
Name: label, Length: 18000, dtype: int64

In [47]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 100)
clf.fit(x,y)

In [49]:
from sklearn.metrics import accuracy_score
y_pred = clf.predict(x)
accuracy_score(y,y_pred)

0.9996111111111111

Testing

In [50]:
df_test = pd.read_csv("test.txt", delimiter=';',names = ['text','label'] )

In [51]:
df_test.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambitious right now,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i don t ever want her to feel like i m ashamed with her,sadness
3,i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived,joy
4,i was feeling a little vain when i did this one,sadness


In [53]:
x_test = df_test['text']
y_test = df_test['label']

In [54]:
y_test = custom_encoder(y_test)
y_test

0       0
1       0
2       0
3       1
4       0
       ..
1995    0
1996    0
1997    1
1998    1
1999    0
Name: label, Length: 2000, dtype: int64

In [55]:
x_test = preprocess(x_test)

In [57]:
x_test[0:5]

['im feeling rather rotten im ambitious right',
 'im updating blog feel shitty',
 'never make separate ever want feel like ashamed',
 'left bouquet red yellow tulips arm feeling slightly optimistic arrived',
 'feeling little vain one']

x_test = cv.transform(x_test)
don't do `fit_transform` for test data. since it might produce diff results than train data. use just `transform`


In [58]:
x_test = cv.transform(x_test)

In [59]:
y_pred = clf.predict(x_test)
accuracy_score(y_test,y_pred)

0.9605

Testing by giving our own data

In [60]:
def sentiment(text):
  text = cv.transform(preprocess(text))
  prediction = clf.predict(text)
  if prediction == 0:
    print('-ve Sentiment')
  else:
    print('+ve Sentiment')


In [66]:
text = ['i am happy today!Nothing bad happened']

In [65]:
sentiment(text)

+ve Sentiment


In [67]:
text = ['i am sad']
sentiment(text)

-ve Sentiment
