In [1]:
# Importing Libraries
import pandas as pd 
import numpy as np 
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Read the tab-separated dataset file into a DataFrame
data = pd.read_csv(filepath_or_buffer="test.tsv", delimiter='\t')

In [3]:
# Show the per-column count of missing values, then keep only complete rows
print(data.isna().sum())
data = data.dropna(axis=0, how='any')

Unnamed: 0    0
title         0
text          0
subject       0
date          0
label         0
dtype: int64


In [4]:
# Remove the unnecessary 'Unnamed: 0' column.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Discard fully duplicated rows, keeping the first occurrence of each
data = data.drop_duplicates(keep='first')

In [None]:
# Initializing necessary NLP tools: fetch the NLTK corpora used for stopword
# removal and lemmatization, then load the small English spaCy pipeline
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zanwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zanwa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zanwa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Lemmatizer applied to each token inside clean_text
lemmatizer = WordNetLemmatizer()
# English stopwords held in a set for fast membership checks
stop_words = {word for word in stopwords.words('english')}

In [8]:
def clean_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps: lowercase the input, strip every character that is not ``a-z``
    or whitespace, tokenize with the spaCy tokenizer, drop stopwords and
    whitespace-only tokens, then lemmatize each remaining token with the
    lemmatizer's default settings.

    Relies on the notebook-level ``nlp``, ``stop_words`` and ``lemmatizer``
    objects created in earlier cells.

    Parameters
    ----------
    text : Any
        Raw text. Any value that is not a ``str`` yields an empty string.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):  # Ensure the input is a string
        return ""
    # Lowercase first so the character filter below only has to keep a-z.
    text = re.sub(r'[^a-z\s]', '', text.lower())
    # Only token texts are needed, so run the tokenizer alone (make_doc)
    # instead of the full pipeline; the resulting tokens are the same.
    doc = nlp.make_doc(text)
    # Runs of spaces/newlines come back as whitespace-only tokens; skip them
    # so they do not end up as stray whitespace in the joined output.
    tokens = [
        lemmatizer.lemmatize(token.text)
        for token in doc
        if not token.is_space and token.text not in stop_words
    ]
    return ' '.join(tokens)

In [9]:
# Run clean_text over every entry of the 'title' column and store the result
data['cleaned_title'] = data['title'].map(clean_text)

In [10]:
# Run clean_text over every entry of the 'text' column and store the result
data['cleaned_text'] = data['text'].map(clean_text)

In [11]:
# Preview the first rows: raw columns side by side with their cleaned versions
print(data.head()[['title', 'cleaned_title', 'text', 'cleaned_text']])

                                               title  \
0  Conservatives Will HATE What Donald Trump Just...   
1  Trump victory may create new tension between U...   
2  WATCH: Hundreds of ILLEGAL ALIENS Storm Senate...   
3  Democratic Senator Franken to resign: CNN, cit...   
4  GANG OF DOMESTIC TERRORISTS Violently Attack L...   

                                       cleaned_title  \
0  conservative hate donald trump said planned pa...   
1  trump victory may create new tension u islam i...   
2  watch hundred illegal alien storm senate build...   
3  democratic senator franken resign cnn citing s...   
4  gang domestic terrorist violently attack lone ...   

                                                text  \
0  Donald Trump isn t exactly a stranger to makin...   
1  Donald Trump’s U.S. election victory may creat...   
2  A couple of quick questions come to mind when ...   
3  U.S. Democratic Senator Al Franken will announ...   

                                        clean

In [12]:
# Number of whitespace-separated tokens in each cleaned title
data['title_length'] = data['cleaned_title'].str.split().str.len()

In [13]:
# Number of whitespace-separated tokens in each cleaned text
data['text_length'] = data['cleaned_text'].str.split().str.len()

In [14]:
from textblob import TextBlob


def _polarity(doc):
    """Return the TextBlob sentiment polarity of ``doc``."""
    return TextBlob(doc).sentiment.polarity


# Sentiment polarity of the cleaned title and cleaned text, one column each
data['title_sentiment'] = data['cleaned_title'].map(_polarity)
data['text_sentiment'] = data['cleaned_text'].map(_polarity)

In [16]:
from sklearn.model_selection import train_test_split

# Feature frame: the cleaned text columns plus their token counts
feature_columns = ['cleaned_title', 'cleaned_text', 'title_length', 'text_length']
X = data[feature_columns]
# Target: the 'label' column
y = data['label']

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary capped at 5000 terms; adjust max_features based on your dataset size
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training text only, then reuse the fitted vectorizer on the
# validation text so both matrices share the same vocabulary
train_corpus = X_train['cleaned_text']
val_corpus = X_val['cleaned_text']
X_train_tfidf = vectorizer.fit_transform(train_corpus)
X_val_tfidf = vectorizer.transform(val_corpus)


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a random forest (fixed seed) on the TF-IDF training matrix
model = RandomForestClassifier(random_state=42).fit(X_train_tfidf, y_train)

# Predict the validation labels and print the per-class metrics report
y_pred = model.predict(X_val_tfidf)
print(classification_report(y_true=y_val, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.96      0.96      0.96       850
           1       0.95      0.96      0.96       802

    accuracy                           0.96      1652
   macro avg       0.96      0.96      0.96      1652
weighted avg       0.96      0.96      0.96      1652



In [20]:
# Read the tab-separated train.tsv file into its own DataFrame
train_data = pd.read_table('train.tsv', sep='\t')