<a href="https://colab.research.google.com/github/kanchan14kumari/sentiment-and-emotion-analysis-of-codemixed-data/blob/main/Basicmodal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import json


# All the sklearn imports
from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted, check_X_y
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize

# Read the three pre-split CSV files. Every missing cell, in any column, is
# replaced with an empty string before the text is vectorized.
train_df = pd.read_csv("preprocessed_train.csv").fillna("")
test_df = pd.read_csv("preprocessed_test.csv").fillna("")
val_df = pd.read_csv("preprocessed_val.csv").fillna("")

# Emotion task: the 'tweet' column is the input text, 'emotion' is the target.
X_train_emotion, y_train_emotion = train_df['tweet'], train_df['emotion']
X_val_emotion, y_val_emotion = val_df['tweet'], val_df['emotion']
X_test_emotion, y_test_emotion = test_df['tweet'], test_df['emotion']

# TF-IDF features for the emotion task. The vectorizer is fitted on the
# training split only; the validation and test splits are transformed with
# that already-fitted vectorizer.
# NOTE(review): stop_words='english' only drops English stop words; the
# repository name suggests code-mixed tweets, so confirm this is intended.
vectorizer_emotion = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    decode_error='ignore',
    stop_words='english',
)
X_train_tfidf_emotion = vectorizer_emotion.fit_transform(X_train_emotion)
X_val_tfidf_emotion, X_test_tfidf_emotion = (
    vectorizer_emotion.transform(texts)
    for texts in (X_val_emotion, X_test_emotion)
)

# Logistic Regression baseline for emotion: fitted on the training TF-IDF
# matrix, evaluated on the test split.
lr_model_emotion = LogisticRegression(max_iter=1000)
lr_model_emotion.fit(X_train_tfidf_emotion, y_train_emotion)
y_pred_lr_emotion = lr_model_emotion.predict(X_test_tfidf_emotion)

# Weighted-average scores over all emotion classes.
# zero_division=0 is passed explicitly: when a class has no predicted (or no
# true) samples its score is undefined, and scikit-learn's default setting
# scores it as 0 while emitting an UndefinedMetricWarning. Stating the 0
# keeps the same numbers and makes the choice deliberate instead of a warning.
accuracy_lr = accuracy_score(y_test_emotion, y_pred_lr_emotion)
precision_lr = precision_score(
    y_test_emotion, y_pred_lr_emotion, average='weighted', zero_division=0
)
recall_lr = recall_score(
    y_test_emotion, y_pred_lr_emotion, average='weighted', zero_division=0
)
f1_score_lr = f1_score(
    y_test_emotion, y_pred_lr_emotion, average='weighted', zero_division=0
)

print("Logistic Regression:")
print("Accuracy: ", accuracy_lr)
print("Precision: ", precision_lr)
print("Recall: ", recall_lr)
print("F1 Score: ", f1_score_lr)

Logistic Regression:
Accuracy:  0.6113333333333333
Precision:  0.6439097903913795
Recall:  0.6113333333333333
F1 Score:  0.5817662722406894


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Sentiment task: same 'tweet' text as input, 'sentiment' column as target.
X_train_sentiment, y_train_sentiment = train_df['tweet'], train_df['sentiment']
X_val_sentiment, y_val_sentiment = val_df['tweet'], val_df['sentiment']
X_test_sentiment, y_test_sentiment = test_df['tweet'], test_df['sentiment']

# A separate TF-IDF vectorizer for the sentiment task, fitted on the training
# split only and then applied to the validation and test splits.
vectorizer_sentiment = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    decode_error='ignore',
    stop_words='english',
)
X_train_tfidf_sentiment = vectorizer_sentiment.fit_transform(X_train_sentiment)
X_val_tfidf_sentiment, X_test_tfidf_sentiment = (
    vectorizer_sentiment.transform(texts)
    for texts in (X_val_sentiment, X_test_sentiment)
)

# Logistic Regression baseline for sentiment: fit on the training TF-IDF
# matrix, then predict the test split.
lr_model_sentiment = LogisticRegression(max_iter=1000).fit(
    X_train_tfidf_sentiment, y_train_sentiment
)
y_pred_lr_sentiment = lr_model_sentiment.predict(X_test_tfidf_sentiment)

# Test-set scores. These assignments reuse the accuracy_lr / precision_lr /
# recall_lr / f1_score_lr names, so any values stored there earlier are replaced.
accuracy_lr = accuracy_score(y_test_sentiment, y_pred_lr_sentiment)
precision_lr, recall_lr, f1_score_lr = (
    weighted_metric(y_test_sentiment, y_pred_lr_sentiment, average='weighted')
    for weighted_metric in (precision_score, recall_score, f1_score)
)

print("Logistic Regression:")
print("Accuracy: ", accuracy_lr)
print("Precision: ", precision_lr)
print("Recall: ", recall_lr)
print("F1 Score: ", f1_score_lr)

Logistic Regression:
Accuracy:  0.6593333333333333
Precision:  0.6626375636850009
Recall:  0.6593333333333333
F1 Score:  0.660589246533395


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Inputs (tweet text) and both label columns for the train and test splits.
X_train, X_test = train_df['tweet'], test_df['tweet']
y_train_emotion, y_train_sentiment = train_df['emotion'], train_df['sentiment']
y_test_emotion, y_test_sentiment = test_df['emotion'], test_df['sentiment']

# Joint target: one label per row of the form "<emotion>_<sentiment>".
y_train_combined = y_train_emotion + "_" + y_train_sentiment
y_test_combined = y_test_emotion + "_" + y_test_sentiment

# TF-IDF features shared by the models below; fitted on the training text
# only and then applied to the test text.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    decode_error='ignore',
    stop_words='english',
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Single Logistic Regression model trained directly on the combined
# "<emotion>_<sentiment>" labels.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train_combined)

# Predict combined labels for the test split.
y_pred_combined = lr_model.predict(X_test_tfidf)

# Weighted-average scores over the combined classes.
# zero_division=0 is passed explicitly: a combined class that is never
# predicted has undefined precision, and scikit-learn's default setting scores
# it as 0 while emitting an UndefinedMetricWarning. Stating the 0 keeps the
# same numbers and makes the choice deliberate instead of a warning.
accuracy_combined = accuracy_score(y_test_combined, y_pred_combined)
precision_combined = precision_score(
    y_test_combined, y_pred_combined, average='weighted', zero_division=0
)
recall_combined = recall_score(
    y_test_combined, y_pred_combined, average='weighted', zero_division=0
)
f1_score_combined = f1_score(
    y_test_combined, y_pred_combined, average='weighted', zero_division=0
)

# Report the test-set scores.
print("Combined Evaluation Metrics:")
print("Accuracy:", accuracy_combined)
print("Precision:", precision_combined)
print("Recall:", recall_combined)
print("F1 Score:", f1_score_combined)


Combined Evaluation Metrics:
Accuracy: 0.546
Precision: 0.5112821046359856
Recall: 0.546
F1 Score: 0.4879249640768176


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#implementing decision tree
from sklearn.tree import DecisionTreeClassifier

# Decision Tree for sentiment, trained on the shared TF-IDF features.
# random_state is fixed because the tree's split search involves randomness
# (features are permuted at each split), so an unseeded tree can give
# different scores on every re-run of this cell. 42 is an arbitrary seed.
dt_model_sentiment = DecisionTreeClassifier(random_state=42)
dt_model_sentiment.fit(X_train_tfidf, y_train_sentiment)
y_pred_dt_sentiment = dt_model_sentiment.predict(X_test_tfidf)

# Weighted-average test scores for sentiment. zero_division=0 states
# explicitly that a class with an undefined score contributes 0, which is the
# value scikit-learn's default already uses, without the warning.
accuracy_dt_sentiment = accuracy_score(y_test_sentiment, y_pred_dt_sentiment)
precision_dt_sentiment = precision_score(
    y_test_sentiment, y_pred_dt_sentiment, average='weighted', zero_division=0
)
recall_dt_sentiment = recall_score(
    y_test_sentiment, y_pred_dt_sentiment, average='weighted', zero_division=0
)
f1_score_dt_sentiment = f1_score(
    y_test_sentiment, y_pred_dt_sentiment, average='weighted', zero_division=0
)

print("\nDecision Tree for Sentiment:")
print("Accuracy: ", accuracy_dt_sentiment)
print("Precision: ", precision_dt_sentiment)
print("Recall: ", recall_dt_sentiment)
print("F1 Score: ", f1_score_dt_sentiment)

# Decision Tree for emotion, trained on the shared TF-IDF features.
# random_state is fixed because the tree's split search involves randomness
# (features are permuted at each split), so an unseeded tree can give
# different scores on every re-run of this cell. 42 is an arbitrary seed.
dt_model_emotion = DecisionTreeClassifier(random_state=42)
dt_model_emotion.fit(X_train_tfidf, y_train_emotion)
y_pred_dt_emotion = dt_model_emotion.predict(X_test_tfidf)

# Weighted-average test scores for emotion. zero_division=0 states explicitly
# that a class with an undefined score contributes 0, which is the value
# scikit-learn's default already uses, without the warning.
accuracy_dt_emotion = accuracy_score(y_test_emotion, y_pred_dt_emotion)
precision_dt_emotion = precision_score(
    y_test_emotion, y_pred_dt_emotion, average='weighted', zero_division=0
)
recall_dt_emotion = recall_score(
    y_test_emotion, y_pred_dt_emotion, average='weighted', zero_division=0
)
f1_score_dt_emotion = f1_score(
    y_test_emotion, y_pred_dt_emotion, average='weighted', zero_division=0
)

print("\nDecision Tree for Emotion:")
print("Accuracy: ", accuracy_dt_emotion)
print("Precision: ", precision_dt_emotion)
print("Recall: ", recall_dt_emotion)
print("F1 Score: ", f1_score_dt_emotion)

# Build "<emotion>_<sentiment>" labels for the test split: the true labels
# from the label columns, the predicted labels by joining the outputs of the
# two separately trained decision trees element by element.
# Note: this reassigns y_pred_combined, replacing any value stored there earlier.
y_test_combined = y_test_emotion + "_" + y_test_sentiment
y_pred_combined = y_pred_dt_emotion + "_" + y_pred_dt_sentiment

# Weighted-average scores over the combined classes.
# zero_division=0 is passed explicitly: a combined class with no predicted
# (or no true) samples has an undefined score, and scikit-learn's default
# setting scores it as 0 while emitting an UndefinedMetricWarning. Stating the
# 0 keeps the same numbers and makes the choice deliberate instead of a warning.
accuracy_dt_combined = accuracy_score(y_test_combined, y_pred_combined)
precision_dt_combined = precision_score(
    y_test_combined, y_pred_combined, average='weighted', zero_division=0
)
recall_dt_combined = recall_score(
    y_test_combined, y_pred_combined, average='weighted', zero_division=0
)
f1_score_dt_combined = f1_score(
    y_test_combined, y_pred_combined, average='weighted', zero_division=0
)

print("\nDecision Tree for Combined Labels:")
print("Accuracy: ", accuracy_dt_combined)
print("Precision: ", precision_dt_combined)
print("Recall: ", recall_dt_combined)
print("F1 Score: ", f1_score_dt_combined)



Decision Tree for Sentiment:
Accuracy:  0.5613333333333334
Precision:  0.5660157203885138
Recall:  0.5613333333333334
F1 Score:  0.5628611988894069

Decision Tree for Emotion:
Accuracy:  0.52
Precision:  0.5452321996286087
Recall:  0.52
F1 Score:  0.520391663928907

Decision Tree for Combined Labels:
Accuracy:  0.34
Precision:  0.46635268008046205
Recall:  0.34
F1 Score:  0.38512234311985843


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
