<a href="https://colab.research.google.com/github/iam-vsr/iam-vsr/blob/main/JG_ML_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Loading the Data**

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
file_path = '/content/drive/My Drive/AI_ML/train.tsv'


In [25]:
import pandas as pd

df = pd.read_csv(file_path, sep='\t')


# **EDA**

**About the Data**

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,2619,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1
1,16043,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0
2,876,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1
3,19963,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0
4,10783,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  30000 non-null  int64 
 1   title       30000 non-null  object
 2   text        30000 non-null  object
 3   subject     30000 non-null  object
 4   date        30000 non-null  object
 5   label       30000 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.4+ MB


In [28]:
df.describe()

Unnamed: 0.1,Unnamed: 0,label
count,30000.0,30000.0
mean,11207.9258,0.484067
std,6533.101217,0.499754
min,0.0,0.0
25%,5560.75,0.0
50%,11071.5,0.0
75%,16832.5,1.0
max,23479.0,1.0


In [29]:
df['Unnamed: 0']

Unnamed: 0.1,Unnamed: 0
0,2619
1,16043
2,876
3,19963
4,10783
...,...
29995,6880
29996,17818
29997,5689
29998,15805


In [32]:
# Dropping the 'Unnamed: 0' column
df = df.drop(columns=["Unnamed: 0"])


In [33]:
df.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [34]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,15478
1,14522


**Cleaning the Title and the Text**

In [35]:
import re
import string

def clean_text(text):
    # Remove special characters, URLs, punctuation, and extra spaces
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespace
    return text.strip().lower()  # Normalize case

# Apply to title and text columns
df['clean_title'] = df['title'].apply(clean_text)
df['clean_text'] = df['text'].apply(clean_text)


In [36]:
df.head()

Unnamed: 0,title,text,subject,date,label,clean_title,clean_text
0,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1,excia head says trump remarks on russia interf...,former cia director john brennan on friday cri...
1,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0,you won’t believe his punishment hispanic stor...,how did this man come to own this store there ...
2,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1,federal reserve governor powells policy views ...,president donald trump on thursday tapped fede...
3,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0,scoundrel hillary supporter starts “trumpleaks...,hillary clinton ally david brock is offering t...
4,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0,nancy pelosi arrogantly dismisses questions on...,pleading ignorance is a perfect ploy for nancy...


**Parsing the date as it's messed up**

In [45]:

from dateutil import parser

# Standardize date formats using a custom function
def parse_date(date):
    try:
        return parser.parse(date)  # Attempt to parse the date
    except Exception:
        return None  # Return None for unparseable dates

# Apply the function to the 'date' column
df['date_new'] = df['date'].apply(parse_date)

# Filter and display invalid dates
print("Number of invalid dates=",df['date_new'].isnull().sum(),'\n')
invalid_dates = df[df['date_new'].isnull()]['date'].tolist()
print("Invalid Dates:", invalid_dates)

Invalid Dates: ['https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg', 'https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg', 'https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg', 'https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg', 'https://100percentfedup.com/served-roy-moore-vietnamletter-veteran-sets-record-straight-honorable-decent-respectable-patriotic-commander-soldier/', 'MSNBC HOST Rudely Assumes Steel Worker Would Never Let His Son Follow in His Footsteps…He Couldn’t Be More Wrong [Video]', 'https://100percentfedup.com/video-hillary-asked-about-trump-i-just-want-to-eat-some-pie/', 'https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg']


In [46]:
invalid_date_mask = df['date'].isin(invalid_dates)
df = df[~invalid_date_mask]

In [56]:
df['date'] = df['date_new']
df = df.drop(columns=['date_new'])
df.head()

Unnamed: 0,title,text,subject,date,label,clean_title,clean_text
0,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,2017-07-22,1,excia head says trump remarks on russia interf...,former cia director john brennan on friday cri...
1,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,2017-06-19,0,you won’t believe his punishment hispanic stor...,how did this man come to own this store there ...
2,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,2017-11-02,1,federal reserve governor powells policy views ...,president donald trump on thursday tapped fede...
3,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,2016-09-17,0,scoundrel hillary supporter starts “trumpleaks...,hillary clinton ally david brock is offering t...
4,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,2017-05-26,0,nancy pelosi arrogantly dismisses questions on...,pleading ignorance is a perfect ploy for nancy...


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29992 entries, 0 to 29999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        29992 non-null  object        
 1   text         29992 non-null  object        
 2   subject      29992 non-null  object        
 3   date         29992 non-null  datetime64[ns]
 4   label        29992 non-null  int64         
 5   clean_title  29992 non-null  object        
 6   clean_text   29992 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 1.8+ MB


In [59]:
# Length of the title (word count)
df['title_word_count'] = df['clean_title'].apply(lambda x: len(x.split()))

# Sentiment of the title
from textblob import TextBlob
df['title_sentiment'] = df['clean_title'].apply(lambda x: TextBlob(x).sentiment.polarity)


In [60]:
df.head()

Unnamed: 0,title,text,subject,date,label,clean_title,clean_text,title_word_count,title_sentiment
0,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,2017-07-22,1,excia head says trump remarks on russia interf...,former cia director john brennan on friday cri...,9,0.0
1,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,2017-06-19,0,you won’t believe his punishment hispanic stor...,how did this man come to own this store there ...,19,0.5
2,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,2017-11-02,1,federal reserve governor powells policy views ...,president donald trump on thursday tapped fede...,10,0.6
3,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,2016-09-17,0,scoundrel hillary supporter starts “trumpleaks...,hillary clinton ally david brock is offering t...,7,0.0
4,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,2017-05-26,0,nancy pelosi arrogantly dismisses questions on...,pleading ignorance is a perfect ploy for nancy...,13,0.0
