In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, concatenate, Bidirectional, Flatten, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.downloader import download


In [None]:
# Fetch the NLTK stop-word corpus; clean_text() reads it via stopwords.words("english").
download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the train and test CSVs; both are decoded as latin1.
# NOTE(review): '/content/...' is an absolute path (presumably a Colab upload) — confirm
# or make it configurable before running elsewhere.
train_data = pd.read_csv('/content/train.csv', encoding='latin1')
test_data = pd.read_csv('/content/test.csv', encoding='latin1')



In [None]:
# Quick size check of the raw test frame: (rows, columns).
test_data.shape

(4815, 9)

In [None]:


# Text Cleaning Function (Optional but recommended)
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK corpus is read once, not once per tweet


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first call."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Lower-case a tweet, strip non-alphanumerics and drop stop words.

    Parameters
    ----------
    text : object
        Raw tweet. Non-string values are converted with ``str()``; ``None`` and
        float NaN are treated as missing.
    stop_words : collection of str, optional
        Words to remove. Defaults to the NLTK English stop-word list, which is
        loaded once and cached.

    Returns
    -------
    str
        Remaining tokens joined by single spaces; ``""`` for missing input.
    """
    # str(None) / str(nan) would otherwise leak the literal tokens "none" / "nan"
    # into the corpus (`text != text` is the NaN test).
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if stop_words is None:
        stop_words = _english_stop_words()

    lowered = str(text).lower()
    tokens = re.sub(r"[^a-zA-Z0-9]+", " ", lowered).split()
    return " ".join(token for token in tokens if token not in stop_words)



In [None]:

# Replace missing tweets with an empty string so every later text step receives a str.
# The result is assigned back to the column: calling fillna(..., inplace=True) on a
# selected column is chained assignment, which pandas 2.x warns about and which does
# not modify the frame once Copy-on-Write is enabled.
train_data['text'] = train_data['text'].fillna("")
test_data['text'] = test_data['text'].fillna("")



In [None]:

# Switch for the cleaning pass: set to False to keep the raw tweet text.
APPLY_TEXT_CLEANING = True

if APPLY_TEXT_CLEANING:
    # Same transformation on both splits; the 'text' column is overwritten.
    for frame in (train_data, test_data):
        frame['text'] = frame['text'].apply(clean_text)




In [None]:

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Build the vocabulary from the training tweets (capped via num_words=10000), then
# turn each tweet into a sequence of word indices padded/truncated to 50 entries.
train_texts = train_data['text']

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)

text_sequences = tokenizer.texts_to_sequences(train_texts)
text_data = pad_sequences(text_sequences, maxlen=50)


In [None]:
# Encode the test tweets with the tokenizer that was fitted on the TRAINING text.
# Do not fit a new Tokenizer here: it would rebind the global `tokenizer` to a
# vocabulary whose word indices differ from the ones the model is trained on, so
# every later tokenizer.texts_to_sequences(...) call would feed mismatched ids.
test_text_sequences = tokenizer.texts_to_sequences(test_data['text'])
# maxlen must stay equal to the value used for the training sequences (50).
text_data_t = pad_sequences(test_text_sequences, maxlen=50)


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# SentimentIntensityAnalyzer loads the VADER lexicon in its constructor, so the
# resource has to be present before instantiation or a fresh kernel fails here.
download('vader_lexicon')

sia = SentimentIntensityAnalyzer()


In [None]:
import nltk
# Download the VADER lexicon used by NLTK's SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Preview the first rows; at this point `text` holds the cleaned tweet, while
# `selected_text` is still the untouched original column.
train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,responded going,"I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,sooo sad miss san diego,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,boss bullying,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,interview leave alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,sons put releases already bought,"Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [None]:
# VADER polarity scores for every training tweet, keyed by its textID.
res = {
    row['textID']: sia.polarity_scores(row['text'])
    for _, row in tqdm(train_data.iterrows(), total=len(train_data))
}

  0%|          | 0/27481 [00:00<?, ?it/s]

In [None]:
# One row per scored tweet (scores become columns after the transpose), joined back
# onto the training metadata by textID; `selected_text` is not used as a feature.
df_train = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'textID'})
    .merge(train_data, on='textID', how='inner')
    .drop(columns=['selected_text'])
)

In [None]:
# VADER polarity scores for the test tweets, kept in their own dict (`tes`) so the
# training scores in `res` are neither overwritten nor mixed into the test frame.
tes = {}
for _, row in tqdm(test_data.iterrows(), total=len(test_data)):
    tes[row['textID']] = sia.polarity_scores(row['text'])

# One row per scored test tweet, joined back onto the test metadata by textID.
df_test = pd.DataFrame(tes).T
df_test = df_test.reset_index().rename(columns={'index': 'textID'})
df_test = pd.merge(df_test, test_data, on='textID', how='inner')


In [None]:
# Size of the merged training frame (rows, columns) after adding the VADER scores.
df_train.shape

(27481, 13)

In [None]:
# Display the merged test frame.
# NOTE(review): the tail rows in the rendered output have a missing textID and empty
# metadata — the test CSV appears to contain blank rows; confirm and drop if so.
df_test

Unnamed: 0,textID,neg,neu,pos,compound,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,0.000,1.000,0.000,0.0000,last session day http twitpic com 67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,0.000,0.610,0.390,0.7501,shanghai also really exciting precisely skyscr...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,0.541,0.459,0.000,-0.7096,recession hit veronique branquinho quit compan...,negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,0.000,0.213,0.787,0.5719,happy bday,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,0.000,0.615,0.385,0.3612,http twitpic com 4w75p like,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4810,,0.000,0.000,0.000,0.0000,,,,,,,,
4811,,0.000,0.000,0.000,0.0000,,,,,,,,
4812,,0.000,0.000,0.000,0.0000,,,,,,,,
4813,,0.000,0.000,0.000,0.0000,,,,,,,,


In [None]:
cat_columns = ['Time of Tweet', 'Age of User', 'Country']
num_columns = ['Population -2020', 'Land Area (Km²)', 'Density (P/Km²)', 'neg', 'neu', 'pos', 'compound']

# Categorical encoding.
# The category levels are learned from the training data instead of being listed by
# hand: the frame holds values such as 'morning'/'noon'/'night', '0-20'/'21-30' and
# full country names, so a hand-written list that does not match them exactly makes
# handle_unknown='ignore' encode every row as all zeros.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 names the dense-output switch `sparse`.
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
categorical_data = ohe.fit_transform(df_train[cat_columns])

# Numerical Scaling (country statistics plus the four VADER scores)
scaler = StandardScaler()
numerical_data = scaler.fit_transform(df_train[num_columns])

# Prepare Target: one indicator column per sentiment label.
target = pd.get_dummies(df_train['sentiment']).values

# Train-Test Split
# NOTE(review): text_data was built from train_data while the other arrays come from
# df_train; the split pairs them by row position, so this assumes the merge kept the
# original row order — confirm.
X_train_text, X_dev_text, X_train_cat, X_dev_cat, X_train_num, X_dev_num, y_train, y_dev = train_test_split(
    text_data, categorical_data, numerical_data, target, test_size=0.2, random_state=42
)


In [None]:
# Count the classes straight from the training labels; no separate `y` variable is
# defined anywhere in this notebook, so relying on one fails on a fresh kernel.
total_classes = df_train['sentiment'].nunique()
print("Number of unique sentiment classes in dataset are: ", total_classes)

Number of unique species in dataset are:  3


In [None]:
# Class balance of the training labels, read from df_train rather than from an
# undefined `y` left over in kernel state.
distribution = df_train['sentiment'].value_counts()
print(distribution)

sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64


In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
import warnings
# NOTE(review): with no category or module argument this suppresses every warning
# for the rest of the session, deprecation notices included — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Column dtypes of the merged training frame (sanity check before encoding/scaling).
print(df_train.dtypes)


neg                 float64
neu                 float64
pos                 float64
compound            float64
text                 object
sentiment            object
Time of Tweet        object
Age of User          object
Country              object
Population -2020      int64
Land Area (Km²)     float64
Density (P/Km²)       int64
dtype: object


In [None]:
def preprocess_test_data(df, tokenizer, ohe, scaler):
    """Turn a frame into per-row model inputs using already-fitted transformers.

    Rows whose ``text`` is missing are skipped. The three transforms run once over
    all remaining rows instead of once per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text`` plus the categorical and numerical feature columns
        listed below.
    tokenizer : object
        Fitted tokenizer exposing ``texts_to_sequences``.
    ohe : object
        Fitted encoder exposing ``transform`` for the categorical columns.
    scaler : object
        Fitted scaler exposing ``transform`` for the numerical columns.

    Returns
    -------
    list of tuple
        One ``(text_data, cat_data, num_data)`` tuple per kept row, each element
        holding a single row (leading dimension 1).
    """
    cat_columns = ['Time of Tweet', 'Age of User', 'Country']
    num_columns = ['Population -2020', 'Land Area (Km²)', 'Density (P/Km²)', 'neg', 'neu', 'pos', 'compound']

    usable = df[df['text'].notna()]
    if usable.empty:
        return []

    sequences = tokenizer.texts_to_sequences(usable['text'].tolist())
    text_batch = pad_sequences(sequences, maxlen=50)
    cat_batch = ohe.transform(usable[cat_columns])
    num_batch = scaler.transform(usable[num_columns])

    # Slice with i:i+1 (not [i]) so each piece keeps its leading batch dimension.
    return [
        (text_batch[i:i + 1], cat_batch[i:i + 1], num_batch[i:i + 1])
        for i in range(len(usable))
    ]


# Handle missing values and preprocess
# NOTE(review): this runs preprocess_test_data over the *training* frame, and
# `preprocessed_data` is not read by any later cell visible here — confirm it is needed.
preprocessed_data = preprocess_test_data(df_train, tokenizer, ohe, scaler)


In [None]:
# Combine the text, categorical, and numerical data into a single feature matrix.
# The text part is the padded word-index matrix, so the raw indices act as numeric features.
X_train_combined = np.concatenate((X_train_text, X_train_cat, X_train_num), axis=1)
# Convert one-hot encoded y_train to class labels
y_train_labels = np.argmax(y_train, axis=1)

# Creating adaboost classifier model.
# random_state is pinned (same seed as the train/dev split) so the baseline score
# is reproducible across runs.
adb = AdaBoostClassifier(random_state=42)
adb_model = adb.fit(X_train_combined, y_train_labels)


In [None]:
# Shapes of the held-out dev arrays, in order: text, categorical, numerical, target.
for dev_array in (X_dev_text, X_dev_cat, X_dev_num, y_dev):
    print(dev_array.shape)

(5497, 50)
(5497, 9)
(5497, 7)
(5497, 3)


In [None]:
# Dev-set feature matrix in the same column order used to fit the AdaBoost model,
# and the dev labels converted from one-hot rows to class indices.
X_val = np.hstack([X_dev_text, X_dev_cat, X_dev_num])
Y_val = y_dev.argmax(axis=1)
print("The accuracy of the model on validation set is", adb_model.score(X_val, Y_val))


The accuracy of the model on validation set is 0.6716390758595597


In [None]:
# Model Building
# Three-input network: a text branch plus raw categorical and numerical vectors,
# concatenated and fed through one hidden layer into a 3-way softmax.

# Text branch. shape=(50,) matches the maxlen=50 used by pad_sequences, and
# input_dim=10000 matches Tokenizer(num_words=10000).
text_input = Input(shape=(50,))
text_embed = Embedding(input_dim=10000, output_dim=128)(text_input)
# return_sequences=True keeps the output of every time step; Flatten then turns
# the (steps, features) output into a single vector per tweet.
text_out = Bidirectional(LSTM(64, return_sequences=True))(text_embed)
text_out_flattened = Flatten()(text_out)

# Side inputs are sized from the encoded training matrices, so they follow
# whatever width the encoder/scaler produced.
cat_input = Input(shape=(categorical_data.shape[1],))
num_input = Input(shape=(numerical_data.shape[1],))

merged = concatenate([text_out_flattened, cat_input, num_input])
merged = Dropout(0.2)(merged)

dense = Dense(64, activation='relu')(merged)
# 3 units: one per column of the one-hot target.
output = Dense(3, activation='softmax')(dense)

model = Model(inputs=[text_input, cat_input, num_input], outputs=output)
# categorical_crossentropy pairs with the one-hot encoded target.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Train for 10 epochs in mini-batches of 32; the dev split is scored after each epoch.
model.fit(
    [X_train_text, X_train_cat, X_train_num],
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=([X_dev_text, X_dev_cat, X_dev_num], y_dev),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fcbdc137550>

In [None]:
def preprocess_and_predict(df, model, tokenizer, ohe, scaler):
    """Predict a class for every row of ``df`` that has complete features.

    A row is skipped when its ``text`` or any categorical or numerical feature is
    missing. All kept rows are transformed and passed to ``model.predict`` in one
    batch rather than one call per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text`` plus the categorical and numerical feature columns
        listed below.
    model : object
        Trained model whose ``predict`` accepts ``[text, categorical, numerical]``
        and returns one score row per input row.
    tokenizer, ohe, scaler : object
        Fitted transformers exposing ``texts_to_sequences`` / ``transform``.

    Returns
    -------
    predictions : list of numpy.ndarray
        One length-1 array holding the arg-max class index per kept row.
    indices : list
        Index labels of ``df`` for the kept rows, in the same order.
    """
    cat_columns = ['Time of Tweet', 'Age of User', 'Country']
    num_columns = ['Population -2020', 'Land Area (Km²)', 'Density (P/Km²)', 'neg', 'neu', 'pos', 'compound']

    complete = (
        df['text'].notna()
        & df[cat_columns].notna().all(axis=1)
        & df[num_columns].notna().all(axis=1)
    )
    usable = df[complete]
    if usable.empty:
        return [], []

    sequences = tokenizer.texts_to_sequences(usable['text'].tolist())
    text_batch = pad_sequences(sequences, maxlen=50)
    cat_batch = ohe.transform(usable[cat_columns])
    num_batch = scaler.transform(usable[num_columns])

    scores = model.predict([text_batch, cat_batch, num_batch], verbose=0)
    predicted_classes = np.argmax(scores, axis=1)

    # Keep the historical return shape: a list of length-1 arrays, one per row.
    predictions = [predicted_classes[i:i + 1] for i in range(len(predicted_classes))]
    indices = usable.index.tolist()
    return predictions, indices


In [None]:
# Handle missing values and predict
predicted_labels, valid_indices = preprocess_and_predict(df_test, model, tokenizer, ohe, scaler)

# Class indices must follow the training target, which is built with
# pd.get_dummies(df_train['sentiment']) and therefore orders its columns by sorted
# label. Deriving the mapping from the test subset instead would shift the indices
# whenever a class is absent from it.
class_names = sorted(df_train['sentiment'].dropna().unique())
class_to_index = {label: position for position, label in enumerate(class_names)}

# Extract actual labels for valid indices and convert them to class indices.
actual_labels = df_test.loc[valid_indices, 'sentiment']
actual_labels_mapped = actual_labels.map(class_to_index)

# Rows without a known label cannot be scored; leave them out instead of silently
# counting them as class 0.
has_label = actual_labels_mapped.notna().to_numpy()
actual_labels_index = actual_labels_mapped[has_label].astype(int).to_numpy()
# ravel() flattens the per-row length-1 prediction arrays into one 1-D vector.
predicted_labels_index = np.ravel(predicted_labels)[has_label]

# Evaluate Model Performance
print(classification_report(
    actual_labels_index,
    predicted_labels_index,
    labels=list(range(len(class_names))),
    target_names=class_names,
))


              precision    recall  f1-score   support

           0       0.63      0.63      0.63      1001
           1       0.63      0.62      0.62      1430
           2       0.69      0.70      0.70      1103

    accuracy                           0.65      3534
   macro avg       0.65      0.65      0.65      3534
weighted avg       0.65      0.65      0.65      3534

