In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read both files with the same codec so train and test text are decoded by
# identical rules. 'latin1' maps every byte to a character and, unlike
# 'unicode_escape', leaves backslash sequences inside tweet text untouched.
train_data = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='latin1')
test_data = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', encoding='latin1')

In [None]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

In [None]:
# Summary statistics of the training frame, using describe()'s default arguments.
train_data.describe()

In [None]:
# Per column: does the training frame contain at least one missing value?
train_data.isna().any(axis=0)

In [None]:
# Per column: does the test frame contain at least one missing value?
test_data.isna().any(axis=0)

In [None]:
# Discard every training row that has a missing value in any column.
train_data = train_data.dropna(axis=0, how='any')

In [None]:
# Text branch input: fit a word-index vocabulary on the training tweets
# (num_words=10000 caps the ids used when encoding), turn each tweet into a
# list of word ids, then pad/truncate every list to a fixed length of 50.
train_texts = train_data['text']
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)
text_sequences = tokenizer.texts_to_sequences(train_texts)
text_data = pad_sequences(sequences=text_sequences, maxlen=50)

# Encode categorical data
# handle_unknown='ignore' makes transform() emit an all-zero group for a
# category that was not present at fit time (for example a country that only
# occurs in the test file) instead of raising ValueError, so the fitted
# encoder can be reused safely on new data. Output on the training data is
# unchanged.
ohe = OneHotEncoder(handle_unknown='ignore')
categorical_data = ohe.fit_transform(train_data[['Time of Tweet', 'Age of User', 'Country']]).toarray()

# Standardise the three numeric country statistics with a StandardScaler that
# is fitted on the training rows and kept for reuse on other data.
numeric_columns = ['Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
scaler = StandardScaler()
scaler.fit(train_data[numeric_columns])
numerical_data = scaler.transform(train_data[numeric_columns])

# Prepare target
# One-hot encode the sentiment label (get_dummies orders the columns by sorted
# label value). The explicit cast matters because get_dummies returns bool or
# uint8 depending on the pandas version, while the loss compares the target
# against floating-point softmax outputs.
target = pd.get_dummies(train_data['sentiment']).values.astype('float32')

In [None]:
# Hold out 20% of the rows as a dev set. All four arrays are split in one call
# so the text, categorical, numerical and target rows stay aligned; the fixed
# random_state makes the partition repeatable.
split_parts = train_test_split(
    text_data,
    categorical_data,
    numerical_data,
    target,
    test_size=0.2,
    random_state=42,
)
(X_train_text, X_dev_text,
 X_train_cat, X_dev_cat,
 X_train_num, X_dev_num,
 y_train, y_dev) = split_parts

# Model: three inputs (padded token ids, one-hot categoricals, scaled numerics)
# feeding a single 3-way softmax classifier.
text_input = Input(shape=(50,))
cat_input = Input(shape=(categorical_data.shape[1],))
num_input = Input(shape=(numerical_data.shape[1],))

# Text branch: token ids -> 128-dimensional embeddings -> 64-unit LSTM output.
text_embed = Embedding(input_dim=10000, output_dim=128)(text_input)
text_out = LSTM(units=64)(text_embed)

# The categorical and numerical inputs are joined to the LSTM output as-is.
merged = concatenate([text_out, cat_input, num_input])

dense = Dense(units=64, activation='relu')(merged)
output = Dense(units=3, activation='softmax')(dense)

model = Model(inputs=[text_input, cat_input, num_input], outputs=output)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Training
# The dev split is passed as validation_data so val_loss / val_accuracy are
# reported after every epoch; without it, overfitting during the 20 epochs is
# invisible. Validation data is only evaluated, never trained on. The returned
# History object is kept so the per-epoch curves can be inspected afterwards.
history = model.fit(
    [X_train_text, X_train_cat, X_train_num],
    y_train,
    validation_data=([X_dev_text, X_dev_cat, X_dev_num], y_dev),
    epochs=20,
    batch_size=32,
)


In [None]:
# Score the trained model on the held-out dev split; the result of evaluate()
# is the cell's displayed value.
dev_inputs = [X_dev_text, X_dev_cat, X_dev_num]
model.evaluate(x=dev_inputs, y=y_dev)

In [None]:
# Function to preprocess and predict sentiment
def preprocess_and_predict(df, model, tokenizer, ohe, scaler):
    """Predict a sentiment class index for every fully populated row of ``df``.

    Rows with a missing value in the text column, any categorical column or
    any numerical column are skipped. The remaining rows are transformed with
    the already-fitted ``tokenizer``, ``ohe`` and ``scaler`` and scored with a
    single ``model.predict`` call (instead of one call per row).

    Args:
        df: DataFrame containing 'text', the three categorical columns and the
            three numerical columns used below.
        model: object whose ``predict([text, cat, num], verbose=0)`` returns
            one row of class scores per input row.
        tokenizer: fitted object exposing ``texts_to_sequences``.
        ohe: fitted encoder exposing ``transform`` for the categorical columns.
        scaler: fitted scaler exposing ``transform`` for the numerical columns.

    Returns:
        ``(predictions, indices)`` where ``predictions`` is a list of
        one-element integer arrays (the argmax class index per scored row) and
        ``indices`` is the list of ``df`` index labels of those rows, both in
        ``df`` order. Both lists are empty when no row is usable.
    """
    categorical_columns = ['Time of Tweet', 'Age of User', 'Country']
    numerical_columns = ['Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
    required_columns = ['text'] + categorical_columns + numerical_columns

    # Keep only rows where every feature the model needs is present.
    usable_rows = df[df[required_columns].notna().all(axis=1)]
    if usable_rows.empty:
        # Nothing to score; avoid calling the transformers/model on empty input.
        return [], []

    # Prepare text data
    seqs = tokenizer.texts_to_sequences(usable_rows['text'].tolist())
    text_data = pad_sequences(seqs, maxlen=50)

    # Prepare categorical data. Passing the DataFrame slice keeps the column
    # names the encoder was fitted with.
    cat_data = ohe.transform(usable_rows[categorical_columns])
    if hasattr(cat_data, 'toarray'):
        # Sparse encoder output must be densified before it reaches the model.
        cat_data = cat_data.toarray()

    # Prepare numerical data
    num_data = scaler.transform(usable_rows[numerical_columns])

    # Predict sentiment for all usable rows in one batch.
    pred = model.predict([text_data, cat_data, num_data], verbose=0)
    predicted_classes = np.argmax(pred, axis=1)

    # One shape-(1,) array per row, matching what callers flatten afterwards.
    predictions = list(predicted_classes.reshape(-1, 1))
    indices = usable_rows.index.tolist()
    return predictions, indices

# Score the test file. df_test is another name for test_data (no copy is made).
df_test = test_data

# Rows with missing features are skipped inside preprocess_and_predict, which
# also returns the index labels of the rows it actually scored.
predicted_labels, valid_indices = preprocess_and_predict(
    df=df_test,
    model=model,
    tokenizer=tokenizer,
    ohe=ohe,
    scaler=scaler,
)

# Ground-truth sentiment for exactly the scored rows, plus its one-hot form.
actual_labels = df_test.loc[valid_indices, 'sentiment']
actual_labels_encoded = pd.get_dummies(actual_labels).to_numpy()


In [None]:
# Convert the actual labels to class indices using the TRAINING label order.
# The model's output columns follow pd.get_dummies on the training sentiments,
# i.e. the sorted training labels. Re-deriving indices from the test labels
# alone would shift them whenever a class is absent from the scored test rows.
class_names = sorted(train_data['sentiment'].unique())
label_to_index = {label: position for position, label in enumerate(class_names)}
# A test label that never occurred in training gets -1, which matches no
# predicted class instead of silently counting as class 0.
actual_labels_class = np.array([label_to_index.get(label, -1) for label in actual_labels])

# Generate classification report. labels/target_names are passed together so
# every row of the report is tied to the right class even if one is missing.
report = classification_report(
    actual_labels_class,
    np.array(predicted_labels).flatten(),
    labels=list(range(len(class_names))),
    target_names=[str(name) for name in class_names],
)
print(report)

In [None]:
# Grouped bar chart: number of training tweets per time of day, split by sentiment.
df = train_data.loc[:, ['Time of Tweet', 'sentiment']]
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Time of Tweet', hue='sentiment', ax=ax)
ax.set_title('Sentiment Distribution by Time of Tweet')
ax.set_xlabel('Time of Tweet')
ax.set_ylabel('Count')
ax.legend(title='Sentiment')
plt.show()

In [None]:
df = train_data.loc[:, ['Age of User', 'sentiment']]

# Count table: one row per age group, one column per sentiment.
age_sentiment = pd.crosstab(index=df['Age of User'], columns=df['sentiment'])

# Stacked bars: each bar is an age group, each segment a sentiment count.
ax = age_sentiment.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title('Sentiment Distribution by Age of User')
ax.set_xlabel('Age of User')
ax.set_ylabel('Count')
ax.legend(title='Sentiment')
plt.show()

In [None]:
df = train_data.loc[:, ['Country', 'sentiment']]

# Restrict the x-axis to the five countries with the most training rows.
top_countries = df['Country'].value_counts().head(5).index

fig, ax = plt.subplots(figsize=(12, 7))
sns.countplot(data=df, x='Country', hue='sentiment', order=top_countries, ax=ax)
ax.set_title('Top 5 Countries by Sentiment Distribution')
ax.set_xlabel('Country')
ax.set_ylabel('Count')
ax.legend(title='Sentiment')
plt.show()

In [None]:
# Pie chart of the share of each sentiment label in the training data.
# Wedges (and therefore the colours) follow value_counts() order.
sentiment_counts = train_data['sentiment'].value_counts()
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['lightblue', 'salmon', 'lightgreen'],
)
ax.set_title('Distribution of Sentiments')
plt.show()