In [1]:
import tensorflow
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras import layers,Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split



## Data Input

In [2]:
df=pd.read_csv('/kaggle/input/sentimental-analysis-for-tweets/sentiment_tweets3.csv')
df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


## Number of labels 

In [3]:
df['label (depression result)'].unique()

array([0, 1])

In [4]:
X=df['message to examine']
y=df['label (depression result)']

In [5]:
df.describe()

Unnamed: 0,Index,label (depression result)
count,10314.0,10314.0
mean,491253.470525,0.224355
std,261688.134407,0.417177
min,106.0,0.0
25%,263019.25,0.0
50%,514988.0,0.0
75%,773521.5,0.0
max,802313.0,1.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Index                      10314 non-null  int64 
 1   message to examine         10314 non-null  object
 2   label (depression result)  10314 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 241.9+ KB


In [7]:
X

0        just had a real good moment. i missssssssss hi...
1               is reading manga  http://plurk.com/p/mzp1e
2        @comeagainjen http://twitpic.com/2y2lx - http:...
3        @lapcat Need to send 'em to my accountant tomo...
4            ADD ME ON MYSPACE!!!  myspace.com/LookThunder
                               ...                        
10309    No Depression by G Herbo is my mood from now o...
10310    What do you do when depression succumbs the br...
10311    Ketamine Nasal Spray Shows Promise Against Dep...
10312    dont mistake a bad day with depression! everyo...
10313                                                    0
Name: message to examine, Length: 10314, dtype: object

In [8]:
y

0        0
1        0
2        0
3        0
4        0
        ..
10309    1
10310    1
10311    1
10312    1
10313    1
Name: label (depression result), Length: 10314, dtype: int64

## Data split for training and testing

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [11]:
print('X_train :',len(X_train))
print('X_test :',len(X_test))
print('y_train :',len(y_train))
print('y_test :', len(y_test))

X_train : 8251
X_test : 2063
y_train : 8251
y_test : 2063


## Tokenizing  text 

In [12]:
tokenizer=Tokenizer(num_words=999,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=" ")

In [13]:
tokenizer.fit_on_texts(X)

In [14]:
X_train_tok=tokenizer.texts_to_sequences(X_train)
X_test_tok=tokenizer.texts_to_sequences(X_test)

In [15]:
X_train.head()

6244    http://twitpic.com/6rzlk - what I see when I w...
2124                    going to NJ for cats competition 
3601       @nere13 Heya  I'm fine thanks  wuu2 today? xxx
1295    @mmitchelldaviss What colour is your toothbrush? 
8849    @Lucy_Nichol78 @RealMissAnxiety I can see it b...
Name: message to examine, dtype: object

In [16]:
X_train_tok[0]

[26, 175, 19, 48, 1, 74, 69, 1, 798, 35, 8]

## Padding the tokenized text to create a consistent training input

In [17]:
X_train_pad=pad_sequences(X_train_tok,maxlen=100)
X_test_pad=pad_sequences(X_test_tok,maxlen=100)

In [18]:
X_train_pad[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  26, 175,
        19,  48,   1,  74,  69,   1, 798,  35,   8], dtype=int32)

In [19]:
X_test_pad[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 488,  23,
       218, 131, 519,   3,   5,  23, 639,   2,  11], dtype=int32)

## Number of words in the vocabulary

In [21]:
len(tokenizer.word_index)

23195

## CNN model creation 

In [22]:
 cnn = Sequential([
     keras.layers.Embedding(len(tokenizer.word_index), 32, input_length=100),
     
     keras.layers.Conv1D(16,3,activation="relu"),
     keras.layers.MaxPool1D(2),
     
     keras.layers.Conv1D(32,3,activation="relu"),
     keras.layers.MaxPool1D(2),
     
     keras.layers.Flatten(),
     
     keras.layers.Dense(64,activation="relu"),
     keras.layers.Dense(1,activation="sigmoid")
 ])

## CNN model's parameters

In [23]:
cnn.compile(optimizer="adam",metrics=["accuracy"],loss=["binary_crossentropy"])

## CNN model training

In [24]:
cnn.fit(
    X_train_pad,
    y_train,
    epochs=10,
    validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e5ba254a4a0>

## CNN model predictions

In [25]:
results = cnn.predict(X_test_pad)



In [26]:
results = results.round()

In [28]:
y_test_np=np.asarray(y_test,dtype=np.int32)

In [30]:
cnn.evaluate(X_test_pad,y_test_np)



[0.33744630217552185, 0.9520116448402405]