In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

In [None]:
df = pd.read_csv('/content/train.csv')
df

Unnamed: 0,ID,title,text,subject,date,class
0,0.0,#AfterTrumpImplodes Hashtag Hilariously Imagi...,What will the world be like post-Donald Trump?...,News,5-Aug-16,0.0
1,1.0,#BlackLivesMatter Leader To Run For Mayor Of ...,The police shooting of black teen Michael Brow...,News,4-Feb-16,0.0
2,2.0,#BringBackObama Hashtag Blows Up On Twitter A...,The six months since President Donald Trump wa...,News,13-Jul-17,0.0
3,3.0,#FreeChrisChristie: Twitter Reacts To The ‘Ho...,"Last Friday, New Jersey Governor Chris Christi...",News,2-Mar-16,0.0
4,4.0,#MakeAmericaBrannigan: Futurama Voice Actor R...,"The incredibly talented voice actor, Billy Wes...",News,13-Aug-16,0.0
...,...,...,...,...,...,...
44914,,,,,,
44915,,,,,,
44916,,,,,,
44917,,,,,,


In [None]:
df = df.drop_duplicates()
df

In [None]:
df = df.dropna(subset=["class"])
df

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

ID         21
title       0
text        0
subject     0
date        0
class       0
dtype: int64

In [None]:
df = df.drop(columns=["ID"])
df

In [None]:
sns.countplot(x='class', data=df, palette=['g','r'])

In [None]:
print(df['subject'].unique())

In [None]:
df['subject'] = df['subject'].apply(lambda x: 'politics' if x == 'politicsNews' else x)
df['subject'].unique()

In [None]:
politics_news = df[df['subject'] == 'politics']
left_news = df[df['subject'] == 'left-news']

# Display sample titles and text for comparison
print(politics_news[['title', 'text']].head())
print(left_news[['title', 'text']].head())


In [None]:
df['subject'] = df['subject'].apply(lambda x: 'politics' if x == 'left-news' else x)
print(df['subject'].unique())

In [None]:
def clean_date(date):
    try:
        return pd.to_datetime(date, format='%d-%b-%y')
    except (ValueError, TypeError):
        return np.nan

In [None]:
df['date'] = df['date'].apply(clean_date)
df = df.dropna(subset=['date'])

In [None]:
df = df.drop_duplicates(keep='first')
df

In [None]:
print(df['subject'].unique())

In [None]:
world_news = df[df['subject'] == 'worldnews']
US_news = df[df['subject'] == 'US_News']
Middle_east = df[df['subject'] == 'Middle-east']
gov = df[df['subject'] == 'Government News']

# Display sample titles and text for comparison
print(world_news[['title', 'text']].head())
print("------------------")
print(US_news[['title', 'text']].head())
print("------------------")
print(Middle_east[['title', 'text']].head())
print("------------------")
print(gov[['title', 'text']].head())

There are also duplicates in text and title

In [None]:
df = df.drop_duplicates(subset=['text'], keep='first')
df = df.drop_duplicates(subset=['title'], keep='first')
df

In [None]:
min_date = df['date'].min()
max_date = df['date'].max()

print(min_date)
print(max_date)

In [None]:
date_range = max_date - min_date
date_range

We can classify the date to 16 time class and each class has 66 values.

In [None]:
date_ranges = []
curr_start_date = min_date
range_duration = pd.Timedelta(days=66)

while curr_start_date < max_date:
    curr_end_date = curr_start_date + range_duration
    if curr_end_date > max_date:
      curr_end_date = max_date
    date_ranges.append((curr_start_date, curr_end_date))
    curr_start_date = curr_end_date

date_ranges

In [None]:
def assign_date_range(date):
    for i, (start, end) in enumerate(date_ranges):
        if start <= date <= end:
            return i
    return 'What?'

In [None]:
df['date_range'] = df['date'].apply(assign_date_range)
df

In [None]:
df['date_range'].value_counts()

In [None]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
df['title'] = df['title'].apply(preprocess_text)
df['text'] = df['text'].apply(preprocess_text)

In [None]:
df['title_length'] = df['title'].apply(lambda x: len(x.split()))
df['text_length'] = df['text'].apply(lambda x: len(x.split()))

In [None]:
le = LabelEncoder()
df['subject'] = le.fit_transform(df['subject'])

In [None]:
df['combined_title_and_text'] = df['title'] + ' ' + df['text']

In [None]:
df['combined_title_and_text'] = df['combined_title_and_text'].astype(str)

In [None]:
X = df[['combined_title_and_text', 'subject', 'date_range']]
y = df['class']

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), 'combined_title_and_text'),
        ('others', OneHotEncoder(handle_unknown='ignore'), ['subject', 'date_range'])
    ],
    remainder='passthrough'
)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train['combined_title_and_text'])
X_test_tfidf = vectorizer.transform(X_test['combined_title_and_text'])

In [None]:
model_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=1000, class_weight='balanced'))
])

model_gb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=1000))
])

model_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier())
])

In [None]:
model_rf.fit(X_train, y_train)

In [None]:
model_gb.fit(X_train, y_train)

In [None]:
model_xgb.fit(X_train, y_train)

In [None]:
columns=['LogisticRegression','RandomForestClassifier','GradientBoostingClassifier','XGBoost']
result1=[]
result2=[]
result3=[]

In [None]:
def cal(model):
    y_pred = model.predict(X_test)
    acc=accuracy_score(y_test, y_pred)
    CR=classification_report(y_test, y_pred)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    MeanCV=np.mean(cv_scores)

    result1.append(acc)
    result2.append(cv_scores)
    result3.append(MeanCV)

    print(model)
    print('Accuracy: ', acc)
    print('Classification Report:\n', CR)
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Mean CV Score: {np.mean(cv_scores)}")

In [47]:
cal(model_rf)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('text',
                                                  TfidfVectorizer(max_features=5000,
                                                                  ngram_range=(1,
                                                                               2)),
                                                  'combined_title_and_text'),
                                                 ('others',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['subject', 'date_range'])])),
                ('classifier',
                 RandomForestClassifier(class_weight='balanced',
                                        n_estimators=1000))])
Accuracy:  0.998274374460742
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00

In [None]:
cal(model_gb)

In [50]:
cal(model_xgb)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('text',
                                                  TfidfVectorizer(max_features=5000,
                                                                  ngram_range=(1,
                                                                               2)),
                                                  'combined_title_and_text'),
                                                 ('others',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['subject', 'date_range'])])),
                ('classifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, c...
                               feature_types=None, gamma=None, grow_policy=None,
                               importan

In [51]:
testing_df = pd.read_csv('/content/test.csv')
testing_df

Unnamed: 0,ID,title,text,subject,date
0,1,U.S. judge presses Trump administration on Dre...,NEW YORK (Reuters) - A U.S. judge on Thursday ...,politicsNews,21-Sep-17
1,2,U.S. judge questions government on Trump's lat...,"GREENBELT, Md. (Reuters) - A U.S. judge on Mon...",politicsNews,16-Oct-17
2,3,U.S. judge questions states seeking to restore...,SAN FRANCISCO (Reuters) - A U.S. judge on Mond...,politicsNews,23-Oct-17
3,4,U.S. judge refuses to halt New York nuclear po...,NEW YORK (Reuters) - A federal judge on Tuesda...,politicsNews,25-Jul-17
4,5,U.S. judge rejects bid to dismiss Indonesian i...,BOSTON (Reuters) - A federal judge on Wednesda...,worldnews,25-Oct-17
...,...,...,...,...,...
4491,4492,Zuckerberg again rejects claims of Facebook im...,NEW YORK (Reuters) - Facebook Inc chief execut...,politicsNews,13-Nov-16
4492,4493,Zuma given November 30 deadline ahead of South...,JOHANNESBURG (Reuters) - South Africa s Nation...,worldnews,20-Oct-17
4493,4494,Zuma says South Africa and Morocco will resume...,JOHANNESBURG (Reuters) - South Africa and Moro...,worldnews,3-Dec-17
4494,4495,Zuma says South Africa's free higher education...,JOHANNESBURG (Reuters) - South Africa s govern...,worldnews,16-Dec-17


In [52]:
testing_df['subject'] = testing_df['subject'].apply(lambda x: 'politics' if x == 'politicsNews' else x)
testing_df['subject'] = testing_df['subject'].apply(lambda x: 'politics' if x == 'left-news' else x)
testing_df['date'] = testing_df['date'].apply(clean_date)
testing_df['date_range'] = testing_df['date'].apply(assign_date_range)
testing_df['title'] = testing_df['title'].apply(preprocess_text)
testing_df['text'] = testing_df['text'].apply(preprocess_text)
testing_df['subject'] = le.fit_transform(testing_df['subject'])
testing_df['combined_title_and_text'] = testing_df['title'] + ' ' + testing_df['text']
testing_df['combined_title_and_text'] = testing_df['combined_title_and_text'].astype(str)
X_testing = testing_df[['combined_title_and_text', 'subject', 'date_range']]

In [54]:
y_testing_pred_rf = model_rf.predict(X_testing)
y_testing_pred_gb = model_gb.predict(X_testing)
y_testing_pred_xgb = model_xgb.predict(X_testing)


In [55]:
model_predictions = {
    'Random Forest': y_testing_pred_rf,
    'Gradient Boosting': y_testing_pred_gb,
    'XGBoost': y_testing_pred_xgb
}

for model_name, y_pred in model_predictions.items():
    count0 = 0
    count1 = 0
    for i in y_pred:
        if i == 0:
            count0 += 1
        elif i == 1:
            count1 += 1
    print(f"{model_name}:")
    print(f"Class 0 count: {count0}")
    print(f"Class 1 count: {count1}\n")

Random Forest:
Class 0 count: 2203
Class 1 count: 2293



In [56]:
testing_df['TARGET'] = y_testing_pred_rf
testing_df

Unnamed: 0,ID,title,text,subject,date,date_range,combined_title_and_text,TARGET
0,1,us judg press trump administr dreamer deadlin,new york reuter us judg thursday repeatedli pr...,5,2017-09-21,13,us judg press trump administr dreamer deadlin ...,1.0
1,2,us judg question govern trump latest travel ban,greenbelt md reuter us judg monday question at...,5,2017-10-16,14,us judg question govern trump latest travel ba...,1.0
2,3,us judg question state seek restor obamacar pa...,san francisco reuter us judg monday appear ske...,5,2017-10-23,14,us judg question state seek restor obamacar pa...,1.0
3,4,us judg refus halt new york nuclear power plan...,new york reuter feder judg tuesday dismiss law...,5,2017-07-25,12,us judg refus halt new york nuclear power plan...,1.0
4,5,us judg reject bid dismiss indonesian immigr l...,boston reuter feder judg wednesday declin dism...,6,2017-10-25,14,us judg reject bid dismiss indonesian immigr l...,1.0
...,...,...,...,...,...,...,...,...
4491,4492,zuckerberg reject claim facebook impact us elect,new york reuter facebook inc chief execut mark...,5,2016-11-13,8,zuckerberg reject claim facebook impact us ele...,1.0
4492,4493,zuma given novemb 30 deadlin ahead south afric...,johannesburg reuter south africa nation prosec...,6,2017-10-20,14,zuma given novemb 30 deadlin ahead south afric...,1.0
4493,4494,zuma say south africa morocco resum diplomat t...,johannesburg reuter south africa morocco resum...,6,2017-12-03,14,zuma say south africa morocco resum diplomat t...,1.0
4494,4495,zuma say south africa free higher educ done fi...,johannesburg reuter south africa govern plan o...,6,2017-12-16,15,zuma say south africa free higher educ done fi...,1.0


In [57]:
submission_csv = testing_df[['ID', 'TARGET']]
submission_csv

Unnamed: 0,ID,TARGET
0,1,1.0
1,2,1.0
2,3,1.0
3,4,1.0
4,5,1.0
...,...,...
4491,4492,1.0
4492,4493,1.0
4493,4494,1.0
4494,4495,1.0


In [58]:
submission_csv.to_csv("submission_csv.csv",index=False)