In [18]:
# Install the missing libraries directly from Jupyter
!pip install textblob beautifulsoup4 pandas numpy nltk openpyxl




In [None]:
#Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from textblob import TextBlob
import sys
import os

In [20]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

In [23]:
# ---------------------------------------------------------
# 1. Import Libraries & Print Versions
# ---------------------------------------------------------
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")
print(f"NLTK version: {nltk.__version__}")
print(f"Python version: {sys.version}")

# Download necessary NLTK data (run once)
nltk.download('stopwords')
nltk.download('punkt')

Pandas version: 2.2.2
Numpy version: 1.26.4
NLTK version: 3.9.1
Python version: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kamle\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kamle\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [25]:
# ---------------------------------------------------------
# 2. Load Data
# ---------------------------------------------------------
# The dataset appears to lack headers, so we added the columns manually in excel
#data = pd.read_csv("/content/all-data.xlsx")
df = pd.read_excel('E:/NLP/Take_home assignment/all-data.xlsx')

if len(df.columns) == 2:
    df.columns = ['Sentiment', 'Headline']

print("Data Loaded. Shape:", df.shape)
print(df.head())

Data Loaded. Shape: (4846, 2)
  Sentiment                                           Headline
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...


In [32]:
# Convert text to lowercase
# Apply only to the 'Headline' column
df['Headline'] = df['Headline'].str.lower()
df['Sentiment'] = df['Sentiment'].str.lower()

In [33]:
print(df.head())

  Sentiment                                           Headline
0   neutral  according to gran , the company has no plans t...
1   neutral  technopolis plans to develop in stages an area...
2  negative  the international electronic industry company ...
3  positive  with the new production plant the company woul...
4  positive  according to the company 's updated strategy f...


In [34]:
# g. Acronym Dictionary (added based on some observations from db)
acronym_dict = {
    "lol": "laugh out loud",
    "asap": "as soon as possible",
    "fyi": "for your information",
    "omg": "oh my god",
    "imo": "in my opinion",
    "eur": "euro",
    "usd": "dollar",
    "mn" : "million",
    "mln" : "million",
    "$" : "dollar",
    "%" : "percent",
    "bn" : "billion"
}

In [35]:
#Stop words check
from nltk.corpus import stopwords


", ".join(stopwords.words('english'))

"a, about, above, after, again, against, ain, all, am, an, and, any, are, aren, aren't, as, at, be, because, been, before, being, below, between, both, but, by, can, couldn, couldn't, d, did, didn, didn't, do, does, doesn, doesn't, doing, don, don't, down, during, each, few, for, from, further, had, hadn, hadn't, has, hasn, hasn't, have, haven, haven't, having, he, he'd, he'll, her, here, hers, herself, he's, him, himself, his, how, i, i'd, if, i'll, i'm, in, into, is, isn, isn't, it, it'd, it'll, it's, its, itself, i've, just, ll, m, ma, me, mightn, mightn't, more, most, mustn, mustn't, my, myself, needn, needn't, no, nor, not, now, o, of, off, on, once, only, or, other, our, ours, ourselves, out, over, own, re, s, same, shan, shan't, she, she'd, she'll, she's, should, shouldn, shouldn't, should've, so, some, such, t, than, that, that'll, the, their, theirs, them, themselves, then, there, these, they, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, 

In [37]:
# Critical for Sentiment Analysis: Remove negation words from the stopword list
# because "not happy" is very different from "happy".
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    negation_words = {
    'no', 'not', 'nor', 'neither', 'none', 'never',
    "don't", "aren't", "won't", "didn't", "couldn't",
    "doesn't", "mightn", "mightn't", "mustn't",
    "needn", "against", "needn't", "wouldn't"
}
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

In [56]:
# ---------------------------------------------------------
# 4. Cleaning Function (Steps a-c,g,d)
# ---------------------------------------------------------

def clean_text(text):
    if not isinstance(text, str):
        return str(text)

    # c. Remove HTML tags
    # BeautifulSoup is the most robust method for this
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # b. Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # g. Replace Acronyms with words
    # We do this before removing special characters to match the acronyms correctly
    words = text.split()
    # Replace if word (upper) matches dict key
    words = [acronym_dict.get(word.lower(), word) for word in words]
    text = " ".join(words)

    #a. Remove special characters, numbers, and extra whitespace
    text = re.sub(r'[^a-z\s]', '', text)
  
    # e. Stopwords
    tokens = text.split()
    # Use 'my_stop_words' (the set), NOT 'stopwords' (the library)
    tokens = [w for w in tokens if w in remove_stopwords(text)]
    text = " ".join(tokens)
    return text


#Apply changes

print("Processing text cleaning...")
df['cleaned_headline'] = df['Headline'].apply(clean_text)

print("\n--- Success! Data Cleaned ---")
print(df[['Headline', 'cleaned_headline']].head())

Processing text cleaning...


  text = BeautifulSoup(text, "html.parser").get_text()



--- Success! Data Cleaned ---
                                            Headline  \
0  according to gran , the company has no plans t...   
1  technopolis plans to develop in stages an area...   
2  the international electronic industry company ...   
3  with the new production plant the company woul...   
4  according to the company 's updated strategy f...   

                                    cleaned_headline  
0  according gran company plans move production r...  
1  technopolis plans develop in stages an area no...  
2  international electronic industry company elco...  
3  the new production plant the company would inc...  
4  according company s updated strategy years bas...  


In [66]:
# ---------------------------------------------------------
# 5. Spellcheck (Step f)
# ---------------------------------------------------------
# Note: Spellchecking is computationally expensive (O(n)). 
# For large datasets (4000+ rows), this can take several minutes.
# We will define the function and apply it to a sample to demonstrate.

def apply_spellcheck(text):
    return str(TextBlob(text).correct())

# Apply to a small sample to verify functionality without waiting too long
print("\nApplying spellcheck to first 5 rows...")
df['final_text_sample'] = df['cleaned_headline'].head(5).apply(apply_spellcheck)


Applying spellcheck to first 5 rows...


In [None]:
df['cleaned_headline'] = df['cleaned_headline'].apply(apply_spellcheck)

In [71]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# ---------------------------------------------------------
# 1. Prepare Data
# ---------------------------------------------------------

# Convert the dataset into two lists: texts and labels
X = df['cleaned_headline'].astype(str)
labels = df['Sentiment']

#Encode the sentiment
label_mapping = {"positive": 1, "negative": 0, "neutral": 2}
y = labels.map(label_mapping)

# Split the data (80% Train, 20% Test)
# It is best practice to split BEFORE vectorizing to prevent data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Data: {X_train.shape[0]} rows")
print(f"Test Data: {X_test.shape[0]} rows")

Training Data: 3876 rows
Test Data: 970 rows


In [72]:
# ---------------------------------------------------------
# 2. MODEL A: Bag of Words (CountVectorizer)
# ---------------------------------------------------------
print("\n" + "="*50)
print("Model A: Bag of Words (CountVectorizer)")
print("="*50)

# Initialize Vectorizer
# max_features=5000 keeps only the top 5000 most frequent words
bow_vectorizer = CountVectorizer(max_features=5000) 

# Fit on Train, Transform both Train and Test
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# --- Print Vectorization Arrays as requested ---
print("\nFeature Names (First 20 examples):")
print(bow_vectorizer.get_feature_names_out()[:20])

print("\nVectorization Array (First 5 rows):")
print(X_train_bow.toarray()[:5]) 

# Train Random Forest
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train_bow, y_train)

# Predict & Evaluate
y_pred_bow = rf_bow.predict(X_test_bow)

print("\n--- BOW Results ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_bow):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_bow))


Model A: Bag of Words (CountVectorizer)

Feature Names (First 20 examples):
['ab' 'abb' 'abc' 'ability' 'able' 'abloy' 'abp' 'abroad' 'ac' 'acacia'
 'access' 'accessories' 'accident' 'accordance' 'according' 'account'
 'accountant' 'accounted' 'accounting' 'accounts']

Vectorization Array (First 5 rows):
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

--- BOW Results ---
Accuracy: 0.7515
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.45      0.55       110
           1       0.76      0.50      0.60       289
           2       0.75      0.94      0.84       571

    accuracy                           0.75       970
   macro avg       0.74      0.63      0.66       970
weighted avg       0.75      0.75      0.73       970



In [73]:
# ---------------------------------------------------------
# 3. MODEL B: TF-IDF Vectorizer
# ---------------------------------------------------------
print("\n" + "="*50)
print("Model B: TF-IDF")
print("="*50)

# Initialize Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit on Train, Transform both Train and Test
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# --- Print Vectorization Arrays as requested ---
print("\nFeature Names (First 20 examples):")
print(tfidf_vectorizer.get_feature_names_out()[:20])

print("\nVectorization Array (First 5 rows):")
print(X_train_tfidf.toarray()[:5])

# Train Random Forest
rf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_tfidf.fit(X_train_tfidf, y_train)

# Predict & Evaluate
y_pred_tfidf = rf_tfidf.predict(X_test_tfidf)

print("\n--- TF-IDF Results ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tfidf):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_tfidf))


Model B: TF-IDF

Feature Names (First 20 examples):
['ab' 'abb' 'abc' 'ability' 'able' 'abloy' 'abp' 'abroad' 'ac' 'acacia'
 'access' 'accessories' 'accident' 'accordance' 'according' 'account'
 'accountant' 'accounted' 'accounting' 'accounts']

Vectorization Array (First 5 rows):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

--- TF-IDF Results ---
Accuracy: 0.7557
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.43      0.54       110
           1       0.77      0.48      0.59       289
           2       0.75      0.96      0.84       571

    accuracy                           0.76       970
   macro avg       0.76      0.62      0.66       970
weighted avg       0.76      0.76      0.73       970



In [1]:
import sys
print(sys.version)

3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]


In [5]:
import bs4
print(bs4.__version__)

4.12.3


In [7]:
import spacy
print(spacy.__version__)
import sklearn
print(sklearn.__version__)

3.8.11
1.5.1
