In [None]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
# Confirmation message; it is only reached when every import above succeeded.
print('Dependencies Loaded Successfully')

# 1. Loading the dataset

In [None]:
# Read the three CSV files from the local data/ directory:
# the training rows, the test rows, and the submission file.
news_df = pd.read_csv(filepath_or_buffer='data/train.csv')
news_test = pd.read_csv(filepath_or_buffer='data/test.csv')
submit = pd.read_csv(filepath_or_buffer='data/submit.csv')

In [None]:
# Preview five randomly chosen training rows; no random_state is set,
# so the rows shown differ between runs.
news_df.sample(5)

In [None]:
# Preview five randomly chosen test rows; no random_state is set,
# so the rows shown differ between runs.
news_test.sample(5)

In [None]:
# (rows, columns) of the training frame.
news_df.shape

In [None]:
# (rows, columns) of the test frame.
news_test.shape

In [None]:
# Shape of the de-duplicated 'id' column: its row count matches
# news_df.shape[0] only when every id in the training frame is unique.
news_df[['id']].drop_duplicates().shape

In [None]:
# NOTE(review): this repeats the earlier news_test.shape cell. Possibly an
# id-uniqueness check on the test frame was intended here — confirm.
news_test.shape

# 2. DataFrame summary statistics

In [None]:
# Add a 'length' column holding the character count of each article's text.
# Rows whose text is missing get a missing length as well, because pandas
# string methods propagate missing values.
news_df['length'] = news_df['text'].str.len()

In [None]:
# First rows of the training frame, including the 'length' column.
news_df.head()

In [None]:
# Column labels of the training frame.
news_df.columns

In [None]:
# Data type of each training column.
news_df.dtypes

In [None]:
# Concise summary: index range, per-column non-null counts, dtypes, memory use.
news_df.info()

In [None]:
# Descriptive statistics for the training frame, transposed so that each
# described column becomes a row.
news_df.describe().transpose()

# 3. Checking for missing data

In [None]:
# Number of missing values in each training column.
news_df.isna().sum()

In [None]:
# Share of training rows with no title, reported as a percentage.
print('Percent of missing "Title" records is %.2f%%' % ((news_df['title'].isna().sum() / len(news_df)) * 100))

In [None]:
# Share of training rows with no author, reported as a percentage.
print('Percent of missing "Author" records is %.2f%%' % ((news_df['author'].isna().sum() / len(news_df)) * 100))

In [None]:
# Share of training rows with no article text, reported as a percentage.
print('Percent of missing "Text" records is %.2f%%' % ((news_df['text'].isna().sum() / len(news_df)) * 100))

In [None]:
# NOTE(review): this repeats the missing-value count shown at the start of
# this section; nothing has changed in news_df between the two cells.
news_df.isna().sum()

In [None]:
# Fill missing values only in the free-text columns. A blanket fillna('')
# would also write empty strings into numeric columns such as the 'length'
# column added earlier, silently turning them into mixed-type columns.
# 'length' therefore stays missing for rows that have no text.
# DataFrame.fillna skips dict keys that are not columns of the frame, so the
# same mapping is safe for both frames.
text_fill = {'title': '', 'author': '', 'text': ''}
news_df = news_df.fillna(value=text_fill)
news_test = news_test.fillna(value=text_fill)

In [None]:
# Re-count missing values per column after the fillna step.
news_df.isnull().sum()

# 4. Classification Plan

In [None]:
# Build the model's text feature for both frames: the author and the title
# joined by a single space, stored in a new 'total' column.
for frame in (news_df, news_test):
    frame['total'] = frame['author'] + ' ' + frame['title']

In [None]:
# Separate the target from the remaining columns. When `columns=` is given,
# pandas does not consult `axis`, so it is omitted here.
# X and Y are reassigned later in the notebook, once the text is stemmed.
Y = news_df['label']
X = news_df.drop(columns='label')

In [None]:
# Display the feature frame (every column of news_df except 'label').
X

In [None]:
# Display the 'label' column of the training frame.
Y

In [None]:
# Single Porter stemmer instance, shared by every call to stemming().
port_stem = PorterStemmer()

In [None]:
# Build the stop-word lookup once. stopwords.words('english') returns a list,
# so calling it inside the comprehension rebuilt that list and scanned it
# linearly for every single token; a frozenset gives O(1) membership tests.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a text string into space-joined Porter stems.

    The text is processed in this order: every character that is not an
    ASCII letter is replaced by a space, the result is lower-cased and split
    on whitespace, English stop words are dropped, and each remaining token
    is stemmed with the module-level ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOP_WORDS
    )

In [None]:
# Overwrite the training 'total' column with its stemmed form.
# The raw author+title text is no longer available in this column afterwards.
news_df['total'] = news_df['total'].apply(stemming)


In [None]:
# Overwrite the test 'total' column with its stemmed form, using the same
# stemming() function as for the training frame.
news_test['total'] = news_test['total'].apply(stemming)

In [None]:
# Print the stemmed training text as plain text.
print(news_df['total'])

In [None]:
# Pull the model inputs and targets out of the frames as NumPy arrays.
X, Y = news_df['total'].values, news_df['label'].values
X_test = news_test['total'].values
# NOTE(review): assumes submit.csv holds the true labels for the test rows,
# in the same row order as news_test — confirm before using Y_test for scoring.
Y_test = submit['label'].values

In [None]:
# Print the array of stemmed training strings.
print(X)

In [None]:
# Print the array of training labels.
print(Y)

In [None]:
# Learn the TF-IDF vocabulary and weights from the training text, then
# replace X with its TF-IDF matrix. fit() returns the fitted vectorizer,
# which lets the two steps be chained.
vectorizer = TfidfVectorizer()
X = vectorizer.fit(X).transform(X)

In [None]:
# Transform the test text with the vectorizer that was fitted on the training
# text. Fitting a second TfidfVectorizer on the test set would learn a
# different vocabulary, so the test matrix would have a different column
# layout (and different IDF weights) from the features the model is trained
# on, making its predictions on X_test invalid.
X_test = vectorizer.transform(X_test)

In [None]:
# Logistic-regression classifier created with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF training matrix X and its labels Y.
model.fit(X, Y)

In [None]:
# Predict labels for the same rows the model was trained on; the resulting
# score measures fit to the training data, not generalisation.
X_train_prediction = model.predict(X)

In [None]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the predictions second. Accuracy is symmetric,
# so the value is unchanged, but the call now matches the API contract.
training_data_accuracy = accuracy_score(Y, X_train_prediction)

In [None]:
# Report the accuracy measured on the training rows.
print('Accuracy score of training data :', training_data_accuracy)