# The Simple Version

## Import Library

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloadin

True

In [3]:
np.random.seed(500)

## Import Dataset and take a Quick Look

In [4]:
#data set
url = "https://raw.githubusercontent.com/henrywan910/Machine-Learning/main/train.csv"
df = pd.read_csv(url)
df

Unnamed: 0,tweets,labels
0,sending solidarity whoever doctor manage incre...,Stressed
1,need see hair amp beard gat book appointment b...,Anxious
2,next time meet someone new dont ask ask love,Normal
3,surprise someone love give la senza gift box r...,Lonely
4,raise hand junhoes ocean lotion life rent free...,Normal
...,...,...
29987,originsmp originssmp,Normal
29988,name muhammad asif farooqi im pakistan amp sta...,Lonely
29989,moms mad wont go overnight skiing trip 5 peopl...,Stressed
29990,even tonight youre go bed date someone cannot ...,Normal


In [None]:
df.head()

Unnamed: 0,tweets,labels
0,sending solidarity whoever doctor manage incre...,Stressed
1,need see hair amp beard gat book appointment b...,Anxious
2,next time meet someone new dont ask ask love,Normal
3,surprise someone love give la senza gift box r...,Lonely
4,raise hand junhoes ocean lotion life rent free...,Normal


In [None]:
df["labels"].value_counts()

Anxious     8388
Normal      7976
Stressed    6840
Lonely      6788
Name: labels, dtype: int64

In [None]:
df["tweets"].value_counts()

human need job cant exist amp make art chill cat                                             352
sad thing disinformation truth come damage already people already mi                         334
sad newscloris insanely talented could make laugh cry drop hat always pleasure               267
feel like im start scratch every aspect life kinda sad kinda powerful kinda necessary        262
advisor tell instead try game market need put money 1 tall bank shape li                     235
                                                                                            ... 
petty self anymore im tire try reach help im gonna give sinc                                   1
carrano perfectly capable get anonymous twitterig account share noxious opinion choose n       1
literally iconic                                                                               1
im tire patient                                                                                1
even tonight youre go bed date

In [5]:
df['labels']=df['labels'].apply(lambda x: 'Normal' if x == 'Normal' else 'Alert')

In [None]:
df['labels'].value_counts()

Alert     22016
Normal     7976
Name: labels, dtype: int64

## Upsampling (Don't Run This Part First)

In [None]:
# from sklearn.utils import resample
# new_normal = resample(df[df["labels"] == "Normal"],
#              replace=True,
#              n_samples=len(df[df["labels"] == "Alert"]),
#              random_state=42)

In [None]:
# new_data = pd.concat([df[df["labels"] == "Alert"], new_normal])

In [None]:
# new_data.count()

tweets    44032
labels    44032
dtype: int64

In [None]:
# new_data['labels'].value_counts()

Alert     22016
Normal    22016
Name: labels, dtype: int64

In [None]:
# df = new_data

In [None]:
# df['labels'].value_counts()

Alert     22016
Normal    22016
Name: labels, dtype: int64

In [None]:
#df.reset_index(inplace=True, drop=True)

## Downsampling

In [None]:
from sklearn.utils import resample
alert_downsample = resample(df[df["labels"] == "Alert"],replace = True, n_samples=len(df[df["labels"] == "Normal"]),random_state=42)
data_downsampled = pd.concat([alert_downsample,df[df["labels"] == "Normal"]])
df = data_downsampled

In [None]:
df.head(30)

Unnamed: 0,tweets,labels
21577,front back leave right side really need,Alert
1176,please dont ignore im call ally help black bi ...,Alert
7390,sad dont see celebration enrique tarrio work a...,Alert
29400,youll happy alone,Alert
16368,tire,Alert
15437,love alone time actually necessity,Alert
8592,get tire learn rest quit,Alert
23006,human need job cant exist amp make art chill cat,Alert
6074,today 7th day stream want get affiliate today ...,Alert
29923,aaaaand day another wellknown incredibly talen...,Alert


In [None]:
df['labels'].value_counts()

Alert     7976
Normal    7976
Name: labels, dtype: int64

In [None]:
df['labels']

21577     Alert
1176      Alert
7390      Alert
29400     Alert
16368     Alert
          ...  
29981    Normal
29983    Normal
29984    Normal
29987    Normal
29990    Normal
Name: labels, Length: 15952, dtype: object

In [None]:
df.reset_index(inplace=True, drop=True)

In [None]:
df

Unnamed: 0,tweets,labels
0,front back leave right side really need,Alert
1,please dont ignore im call ally help black bi ...,Alert
2,sad dont see celebration enrique tarrio work a...,Alert
3,youll happy alone,Alert
4,tire,Alert
...,...,...
15947,wait till get play backwards uncover hidden me...,Normal
15948,nashville man love border collie lulu much nam...,Normal
15949,grrrgrgrgr ur right lt3,Normal
15950,originsmp originssmp,Normal


## Data Prepoccessing 

In [6]:
df['tweets'].dropna(inplace=True)
df['tweets'] = [entry.lower() for entry in df['tweets']]
df['tweets']= [ word_tokenize(entry) for entry in df['tweets']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(df['tweets']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

In [None]:
df['labels']

21577    Alert
1176     Alert
7390     Alert
29400    Alert
16368    Alert
         ...  
15945      NaN
15946      NaN
15948      NaN
15949      NaN
15951      NaN
Name: labels, Length: 24082, dtype: object

In [7]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(df['text_final'],df['labels'],test_size=0.2)

In [None]:
y_train

7592      Alert
10051    Normal
8867     Normal
14934    Normal
12550    Normal
          ...  
3790      Alert
8465     Normal
12097    Normal
951       Alert
15194    Normal
Name: labels, Length: 12761, dtype: object

Encode the label

In [8]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

In [None]:
y_train

array([0, 1, 1, ..., 1, 0, 1])

Transform the features

In [9]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['text_final'])
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)

Machine Learning

In [None]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_Tfidf,y_train)
# predict the labels on validation dataset
y_pred_SVM = SVM.predict(X_test_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(y_pred_SVM, y_test)*100,"%")

In [None]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train_Tfidf, y_train)

# predict the labels on validation dataset
y_pred_NB = Naive.predict(X_test_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(y_pred_NB, y_test)*100,"%")

Naive Bayes Accuracy Score ->  79.34816671889689 %


In [None]:
#Voting
from sklearn.ensemble import VotingClassifier

seed = 7
kfold = model_selection.KFold(n_splits=10)
# create the sub models
estimators = []
model1 = Naive
estimators.append(('NB', model1))
model2 = SVM
estimators.append(('svm', model2))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X_train_Tfidf,y_train, cv=kfold)
print("Accuracy for VotingClassifier:", results.mean()*100,"%")

Accuracy for VotingClassifier: 83.19091326409124 %


In [None]:
j_test = ['I am good', 'Oh my...', 'pig is died', 'I want to eat pig']
j_pd = pd.DataFrame(j_test)
j_pd.columns = ['tweets']
j_pd.to_csv('./j.csv')

In [None]:
df = pd.read_csv('./j.csv',encoding='latin-1', index_col=[0])
df.head()

Unnamed: 0,tweets
0,I am good
1,Oh my...
2,pig is died
3,I want to eat pig


In [None]:
df['tweets']

0            I am good
1             Oh my...
2          pig is died
3    I want to eat pig
Name: tweets, dtype: object

In [None]:
df['tweets'].dropna(inplace=True)
df['tweets'] = [entry.lower() for entry in df['tweets']]
df['tweets']= [ word_tokenize(entry) for entry in df['tweets']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(df['tweets']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

In [None]:
XJ_Tfidf = Tfidf_vect.transform(df["text_final"])
SVM.predict(XJ_Tfidf)

array([1, 1, 0, 1])

In [None]:
Encoder.inverse_transform(list(SVM.predict(XJ_Tfidf)))

array(['Normal', 'Normal', 'Alert', 'Normal'], dtype=object)