In [104]:
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer #TfidfVectorizer is used for text representation 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.optimizers import Adam
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence  import pad_sequences
from tensorflow.keras.models  import Sequential
from tensorflow.keras.layers  import Embedding,SimpleRNN,LSTM,Dense

In [105]:
training_df=pd.read_csv("twitter_training.csv")
training_df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [109]:
training_df.columns = ['id', 'game', 'label', 'text']

In [110]:
training_df

Unnamed: 0,id,game,label,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [144]:
training_df["label"].value_counts()

label
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [112]:
training_df.iloc[:, 2]# retrieve all rows and just take coloumn with index 2

0        Positive
1        Positive
2        Positive
3        Positive
4        Positive
           ...   
74676    Positive
74677    Positive
74678    Positive
74679    Positive
74680    Positive
Name: label, Length: 74681, dtype: object

In [113]:
training_df.iloc[:, 2].value_counts()

label
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [114]:
training_df.iloc[:, 3]#take all rows with coloumn index 3

0        I am coming to the borders and I will kill you...
1        im getting on borderlands and i will kill you ...
2        im coming on borderlands and i will murder you...
3        im getting on borderlands 2 and i will murder ...
4        im getting into borderlands and i can murder y...
                               ...                        
74676    Just realized that the Windows partition of my...
74677    Just realized that my Mac window partition is ...
74678    Just realized the windows partition of my Mac ...
74679    Just realized between the windows partition of...
74680    Just like the windows partition of my Mac is l...
Name: text, Length: 74681, dtype: object

In [115]:
nlp=spacy.load("en_core_web_md\en_core_web_md-3.8.0")

In [116]:
text=training_df.iloc[2,3]

In [117]:
doc=nlp(text)
for token in doc:
    print(token)

i
m
coming
on
borderlands
and
i
will
murder
you
all
,


In [118]:
# logically you do a preprocess function and apply it to you dataframe so lets do it:
def preprocess(text):
    doc=nlp(text)
    filtered=[]
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered.append(token.lemma_)
    return " ".join(filtered)

In [119]:
training_df.iloc[:,3]

0        I am coming to the borders and I will kill you...
1        im getting on borderlands and i will kill you ...
2        im coming on borderlands and i will murder you...
3        im getting on borderlands 2 and i will murder ...
4        im getting into borderlands and i can murder y...
                               ...                        
74676    Just realized that the Windows partition of my...
74677    Just realized that my Mac window partition is ...
74678    Just realized the windows partition of my Mac ...
74679    Just realized between the windows partition of...
74680    Just like the windows partition of my Mac is l...
Name: text, Length: 74681, dtype: object

In [120]:
training_df.isnull().sum()# there are missing values 

id         0
game       0
label      0
text     686
dtype: int64

In [121]:
training_df["text"]=training_df["text"].fillna("unknown")

In [123]:
training_df["preprocessed text"]=training_df["text"].apply(preprocess)
training_df

Unnamed: 0,id,game,label,text,preprocessed text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,m get borderland kill
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,m come borderland murder
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,m get borderland murder
...,...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...,realize windows partition Mac like 6 year Nvid...


In [124]:
encoder=LabelEncoder()
training_df["category"]=encoder.fit_transform(training_df["label"])

In [125]:
x=training_df["preprocessed text"]
y=training_df["category"]

In [126]:
x

0                                         come border kill
1                                    m get borderland kill
2                                 m come borderland murder
3                                m get borderland 2 murder
4                                  m get borderland murder
                               ...                        
74676    realize Windows partition Mac like 6 year Nvid...
74677    realize Mac window partition 6 year Nvidia dri...
74678    realize window partition Mac 6 year Nvidia dri...
74679    realize windows partition Mac like 6 year Nvid...
74680    like window partition Mac like 6 year driver i...
Name: preprocessed text, Length: 74681, dtype: object

In [127]:
y

0        3
1        3
2        3
3        3
4        3
        ..
74676    3
74677    3
74678    3
74679    3
74680    3
Name: category, Length: 74681, dtype: int64

In [128]:
v=TfidfVectorizer()
x_vec=v.fit_transform(x)

In [129]:
x_vec

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 712808 stored elements and shape (74681, 27958)>

In [130]:
x_train,x_test,y_train,y_test=train_test_split(x_vec,y,random_state=101,test_size=0.2)

In [131]:
lr=LogisticRegression()
lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [132]:
y_pred=lr.predict(x_test)

y_pred

array([1, 2, 1, ..., 2, 3, 3])

In [133]:
print(accuracy_score(y_pred,y_test))

0.766619803173328


now we can use rnn to make classification also

In [164]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(training_df["text"])


In [165]:
tokenizer.word_index

{'the': 1,
 'i': 2,
 'to': 3,
 'and': 4,
 'a': 5,
 'of': 6,
 'is': 7,
 'in': 8,
 'for': 9,
 'this': 10,
 'it': 11,
 'you': 12,
 'on': 13,
 'my': 14,
 'that': 15,
 'com': 16,
 'with': 17,
 'game': 18,
 'so': 19,
 'be': 20,
 'me': 21,
 'have': 22,
 'just': 23,
 'but': 24,
 'not': 25,
 'are': 26,
 'all': 27,
 'at': 28,
 'was': 29,
 'like': 30,
 'out': 31,
 'from': 32,
 '2': 33,
 'your': 34,
 'pic': 35,
 'twitter': 36,
 'now': 37,
 'get': 38,
 'we': 39,
 'as': 40,
 'they': 41,
 'has': 42,
 'if': 43,
 'one': 44,
 'do': 45,
 'good': 46,
 't': 47,
 'about': 48,
 'can': 49,
 'play': 50,
 'no': 51,
 'will': 52,
 'an': 53,
 'new': 54,
 'really': 55,
 'love': 56,
 'when': 57,
 'up': 58,
 "i'm": 59,
 'unk': 60,
 'what': 61,
 'more': 62,
 'time': 63,
 'by': 64,
 'johnson': 65,
 'how': 66,
 'people': 67,
 'some': 68,
 'or': 69,
 'why': 70,
 '3': 71,
 'see': 72,
 'shit': 73,
 "it's": 74,
 'co': 75,
 'been': 76,
 'best': 77,
 'still': 78,
 'facebook': 79,
 '’': 80,
 'https': 81,
 'got': 82,
 'games': 

In [166]:
sequences=tokenizer.texts_to_sequences(training_df["text"])

In [167]:
max_sequence_length=max(len(x) for x in sequences ) 
max_sequence_length

166

In [168]:
x=pad_sequences(sequences , maxlen=max_sequence_length ,padding="pre")

In [169]:
x

array([[   0,    0,    0, ...,  435,   12,   27],
       [   0,    0,    0, ...,  435,   12,   27],
       [   0,    0,    0, ..., 1794,   12,   27],
       ...,
       [   0,    0,    0, ...,  118, 1026, 2158],
       [   0,    0,    0, ...,   80,   47, 2158],
       [   0,    0,    0, ...,    2, 1026, 2158]], dtype=int32)

In [198]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
y=encoder.fit_transform(training_df[["label"]])

In [199]:
y.shape

(74681, 4)

In [202]:
y

array([[0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [203]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [204]:
vocab_size=len(tokenizer.word_index)+1
vocab_size

33784

In [205]:
embedding_dim=100# we will represent each word will be represented in 100 point 
rnn_model=Sequential([
    Embedding(vocab_size,embedding_dim,input_length=max_sequence_length),##input size equal size of longethest message that is max_sequence_length
    SimpleRNN(128,activation="tanh",return_sequences=True),#return_sequences="true" means bind this layer with next layer
    SimpleRNN(128,activation="tanh",return_sequences=False),# as long as this is the last layer thenfore by default return_sequences="False"
    Dense(4,activation="softmax"),#output layer
    
])

rnn_model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])



In [206]:
y.shape

(74681, 4)

In [211]:
y_train

array([[0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

In [208]:
x.shape

(74681, 166)

In [209]:
x_train

array([[   0,    0,    0, ...,    4,  796, 4284],
       [   0,    0,    0, ...,  625,    6, 4136],
       [   0,    0,    0, ..., 1102, 2643,  141],
       ...,
       [   0,    0,    0, ...,  447, 2601, 2654],
       [   0,    0,    0, ...,  784,  206,   86],
       [   0,    0,    0, ...,   13,   10,  341]], dtype=int32)

In [210]:
rnn_model.fit(x_train , y_train ,epochs=5,batch_size=64,validation_data=(x_test,y_test))#enter 934 batches in one eboch 

Epoch 1/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 164ms/step - accuracy: 0.3558 - loss: 1.3471 - val_accuracy: 0.5873 - val_loss: 1.0282
Epoch 2/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 152ms/step - accuracy: 0.6512 - loss: 0.8851 - val_accuracy: 0.7301 - val_loss: 0.6990
Epoch 3/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 149ms/step - accuracy: 0.8001 - loss: 0.5387 - val_accuracy: 0.7913 - val_loss: 0.5662
Epoch 4/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 156ms/step - accuracy: 0.8659 - loss: 0.3662 - val_accuracy: 0.8190 - val_loss: 0.5174
Epoch 5/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 150ms/step - accuracy: 0.8944 - loss: 0.2911 - val_accuracy: 0.8201 - val_loss: 0.5255


<keras.src.callbacks.history.History at 0x1e474b9d1d0>

When you're using the evaluate() method on a model, you don't need to manually make predictions using model.predict() beforehand. The evaluate() method internally handles the prediction for you.
x_test is the input data for evaluation (e.g., your test features).

y_test is the true target labels (e.g., the correct class labels for the test data).

evaluate() will:

Automatically make predictions on x_test using the model.

Compare those predictions to y_test.

Compute the loss and any metrics (like accuracy) you specified when compiling the model.
dont say print(accuracy_score(y_pred,y_test)) because evaluate direct calcilate accuracy,loss


In [212]:
y_pred=rnn_model.predict(x_test)

[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 40ms/step


In [214]:
y_pred

array([[3.6931258e-01, 2.7461737e-01, 3.1087002e-01, 4.5200020e-02],
       [3.9561071e-02, 1.8072850e-03, 3.3911860e-03, 9.5524043e-01],
       [1.6880163e-03, 9.9009389e-01, 5.4718568e-03, 2.7462728e-03],
       ...,
       [2.0377205e-03, 1.6042840e-04, 9.5295757e-01, 4.4844307e-02],
       [1.2189784e-02, 9.7358352e-01, 1.1378725e-02, 2.8479537e-03],
       [1.8757001e-04, 8.3513032e-06, 9.9963903e-01, 1.6508508e-04]],
      dtype=float32)

In [215]:
y_test

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

In [217]:
rnn_model.evaluate(x_test,y_test)

[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 32ms/step - accuracy: 0.8179 - loss: 0.5302


[0.5254740118980408, 0.8201111555099487]

Why evaluate() is convenient:
It saves you the step of having to run predictions manually and then comparing those predictions to y_test.

It’s a single call to get the loss and any metrics for your model on the test data.