In [1]:
#Import the required packages for your project
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix

In [2]:
# Open test.csv as a chunked reader that yields 10,000 rows per chunk
# instead of parsing the whole file in a single call.
# NOTE: at this point `df` is an iterator of DataFrame chunks, not a DataFrame yet;
# the chunks are combined into one DataFrame in the next cell.
df = pd.read_csv("test.csv", chunksize=10_000)


In [3]:
# Combine all the chunks produced by the chunked reader into a single DataFrame.
# The isinstance guard makes this cell safe to re-run: once `df` is already a
# DataFrame, calling pd.concat(df) again would raise a TypeError.
if not isinstance(df, pd.DataFrame):
    df = pd.concat(df)

In [4]:
# Show the combined DataFrame as the cell output.
df

Unnamed: 0,text,sentiment
0,"My daughter liked it but I was aghast, that a ...",neg
1,I... No words. No words can describe this. I w...,neg
2,this film is basically a poor take on the old ...,neg
3,"This is a terrible movie, and I'm not even sur...",neg
4,First of all this movie is a piece of reality ...,pos
...,...,...
24995,"For one thing, he produced this movie. It has ...",neg
24996,The title comes from an alteration an adolesce...,pos
24997,Christopher Nolan's first film is a 'no budget...,pos
24998,The story is shortly about the faith-lacking b...,neg


In [5]:
# Check for missing values: df.info() lists the non-null count and dtype of every column,
# so a non-null count lower than the number of rows would reveal nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       25000 non-null  object
 1   sentiment  25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [6]:
# Check the column names available in the DataFrame.
df.columns

Index(['text', 'sentiment'], dtype='object')

In [7]:
# Independent variable (feature): the raw review text.
# Dependent variable (target): the sentiment label.
x, y = df["text"], df["sentiment"]

# Use 80% of the rows for training and the rest for testing;
# the fixed random_state makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [8]:
# Why a TF-IDF vectorizer?
# The feature column contains free-text sentences, which a classifier cannot use directly.
# TfidfVectorizer identifies the unique tokens in the text and creates one numeric column per token.
# This line only creates the (not yet fitted) vectorizer; it is not the classification model.
TF_IDF = TfidfVectorizer()

In [9]:
# Fit the vectorizer on the training text and transform that same text
# into TF-IDF features in a single step.
xtrain_tf_idf = TF_IDF.fit_transform(xtrain)
# The result is a sparse matrix; leaving it as the last expression displays its summary.
xtrain_tf_idf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2705957 stored elements and shape (20000, 67349)>

In [10]:
# The TF-IDF features are stored sparsely. Calling toarray() on the full training matrix
# would allocate rows x vocabulary float64 values (20000 x 67349 here, roughly 10 GB), and
# no later cell reads the dense copy, so only a small slice is densified for inspection.
dense_preview = xtrain_tf_idf[:5].toarray()
# The unique tokens that became feature columns are available via get_feature_names_out().
print(TF_IDF.get_feature_names_out())

['00' '000' '00000001' ... 'þorleifsson' 'יגאל' 'כרמון']


In [11]:
# Print every review whose text contains the search string.
# Note that the search string ends with a space, so it only matches
# occurrences that are followed by a space.
needle = "orleifsson "
for review in filter(lambda text: needle in text, df["text"]):
    print(review)

Badly shot, badly edited, clumsy dialogue, flat characters, unsuccessful adaption of a novel. It doesn't really get much worse. Decent acting and good popcorn saved me for this hour and a half - felt more like three hours - of boredom. Occasional good one-liners.<br /><br />David is a dim-witted young man, who has never recovered from losing his brother at an early age. He puts his faith in a Chinese philosophy mumbo jumbo video, although that doesn't seem to help him much in real life. David is a member of a debt-collectors gang, where every member has an IQ below sea level. A lacking script, along with uncreative shooting and even worse editing, make what could have been quite funny incidents of failed debt-collecting look like a amateurish homemade reality show.<br /><br />David rents an apartment from an elementary school teacher, Haraldur, who by dropping a couple of sentences about his own importance convinces David he is dealing with the most dangerous criminal in Iceland. A sev

In [12]:
# Sentiment prediction is a classification problem (the target is 'pos' or 'neg'),
# so logistic regression is used as the classifier.
model = LogisticRegression()

In [13]:
# Train the classifier on the TF-IDF features of the training text and their sentiment labels.
model.fit(xtrain_tf_idf,ytrain)

In [14]:
# Check how the model performs on the data it was trained on, using model.score.
# This is an optimistic number; the score on the held-out test data is the one that matters.
model.score(xtrain_tf_idf,ytrain)

0.93785

In [15]:
# Apply the SAME fitted vectorizer to the test text (transform only, no refit), so the
# test features line up with the columns the model was trained on. Skipping this step
# would make the later score/predict calls fail.
xtest_tf_idf = TF_IDF.transform(xtest)
# Preview only the first few rows as a dense array: densifying the whole test matrix
# just to display it would allocate rows x vocabulary float64 values (several GB here).
xtest_tf_idf[:5].toarray()


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Check how the model performs on the held-out test data, using model.score.
model.score(xtest_tf_idf,ytest)

0.8904

In [17]:
# A new, unseen sentence used to try out the trained model.
# It is wrapped in a list because the vectorizer is given a collection of documents.
new_input = ["The movie I watched was not good"]

In [18]:
# Vectorize the new sentence with the already-fitted TF-IDF vectorizer (transform only, no refit).
new_test = TF_IDF.transform(new_input)
# Keep the predicted label array in `res`; the text-generation prompt later in the notebook uses it.
res = model.predict(new_test)

In [19]:
# Predict a sentiment label for every review in the test set.
ypred = model.predict(xtest_tf_idf)
# Display the predicted labels as the cell output.
ypred

array(['pos', 'neg', 'pos', ..., 'neg', 'pos', 'pos'], dtype=object)

In [20]:
# Check the model accuracy: the fraction of test reviews whose predicted label matches the true label.
accuracy_score(ytest,ypred)


0.8904

In [21]:
# Predict the sentiment of the new sentence from its TF-IDF features and print the label.
predicted_input = model.predict(new_test)
print(predicted_input)

['neg']


In [22]:
# Check the errors per class with a confusion matrix.
# Per scikit-learn's convention, rows are the true labels and columns the predicted labels,
# with labels in sorted order (here 'neg', then 'pos'); the diagonal holds the correct predictions.
confusion_matrix(ytest,ypred)

array([[2184,  317],
       [ 231, 2268]])

In [23]:
# Count the rows per sentiment label to check whether the classes are balanced.
df.value_counts("sentiment")

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neg,12500
pos,12500


In [24]:
# SENTIMENT ANALYSIS USING AN LLM
import os
import requests

API_URL = "https://router.huggingface.co/hf-inference/models/tabularisai/multilingual-sentiment-analysis"

# Read the Hugging Face access token from the environment instead of hardcoding it.
# SECURITY: a token was previously committed in this cell; treat it as leaked and revoke it.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face access token.")

headers = {
    "Authorization": f"Bearer {HF_TOKEN}"
}

def query(payload):
    """POST `payload` as JSON to API_URL and return the decoded JSON response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # Without a timeout, requests can wait forever on a stalled connection.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    # Fail loudly on an error status instead of returning the error body as if it were a result.
    response.raise_for_status()
    return response.json()

output = query({
    "inputs": "The movie I watched was not good",
})
print(output)

[[{'label': 'Negative', 'score': 0.6506085395812988}, {'label': 'Very Negative', 'score': 0.1631215363740921}, {'label': 'Neutral', 'score': 0.14427414536476135}, {'label': 'Positive', 'score': 0.023288456723093987}, {'label': 'Very Positive', 'score': 0.018707331269979477}]]


In [25]:
# TEXT GENERATION LLM
import os
import requests

API_URL = "https://router.huggingface.co/together/v1/chat/completions"

# Read the Hugging Face access token from the environment instead of hardcoding it.
# SECURITY: a token was previously committed in this cell; treat it as leaked and revoke it.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face access token.")

headers = {
    "Authorization": f"Bearer {HF_TOKEN}"
}

def query(payload):
    """POST `payload` as JSON to API_URL and return the decoded JSON response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # Without a timeout, requests can wait forever on a stalled connection.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    # Fail loudly on an error status; otherwise an error body would only surface
    # later as a confusing KeyError when 'choices' is looked up.
    response.raise_for_status()
    return response.json()

# Ask the chat model to explain the classifier's prediction (`res`) for `new_input`.
response = query({
    "messages": [
        {
            "role": "user",
            "content": f"{new_input} why did the model predict {res}"
        }
    ],
    "model": "mistralai/Mistral-7B-Instruct-v0.3"
})

# Print only the text of the first returned choice.
print(response['choices'][0]['message']['content'])

 The model predicted 'neg' because the sentence you provided expresses a negative sentiment about the movie you watched. The model is trained to identify and classify sentiments as positive, negative, or neutral based on the words and phrases used in the text. In this case, the word 'not' and the phrase 'was not good' indicate a negative sentiment, which is why the model predicted 'neg'.


<h1><u>DOCUMENTATION</u>: Explanation for Sentiment Classification Output</h1>

## <u>Objective</u>: This document explains why a sentiment classification model predicted a label of 'NEGATIVE' for a given input sentence.

### <u>Input Sentence</u>: "I didn't like the movie. It was not good."

### <u>Model Prediction</u>: Predicted Sentiment: NEGATIVE

### <u>Explanation</u>: The model predicted 'NEGATIVE' because the input sentence contains linguistic patterns commonly associated with negative sentiment. Sentiment classification models are trained on large datasets containing labeled examples of text associated with positive, negative, or neutral emotions.

### <u>Reasoning: Words and Phrases Detected</u>:
- "didn't like": Indicates dislike, a strong negative emotion.
- "not good": Negates a positive adjective, turning it into a negative sentiment.

### <u>Model Behavior</u>: The model identifies sentiment using pattern recognition, paying attention to negative modifiers like "not" and verbs like "didn't like".
These phrases appear frequently in training data labeled as negative, so the model learns to associate them with negative emotions.

### <u>Classification Logic</u>: The presence of negative expressions strongly influences the model’s confidence in labeling the sentence as negative.

Even if the sentence is grammatically complex, the model's language understanding is sufficient to detect the overall emotional tone.

# <u>CONCLUSION</u>:
The model predicted 'NEGATIVE' because the sentence expresses a clear negative opinion. It contains phrases like "didn't like" and "was not good", which reflect dissatisfaction. These expressions are strongly associated with negative sentiment in the training data.

**bold text**




