In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
train = pd.read_excel('Data_Train.xlsx')
test = pd.read_excel('Data_Test.xlsx')
submission = pd.read_excel('Sample_submission.xlsx')


* Exploratory Data Analysis: A Simple analysis of Data
* Data Cleaning
* Data Preprocessing: Count Vectors and TF-IDF Vectors
* Training the Classifier
* Predicting For The Test set
* Submitting your solution at MachineHack

In [3]:
# Importing the Libraries

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

# Download the Following Modules once

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kesha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kesha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [4]:
print(train.shape)
train.head()

(7628, 2)


Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3


In [5]:
print(test.shape)
test.head()

(2748, 1)


Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


In [6]:
#Printing the group by description of each category

train.groupby('SECTION').describe()

Unnamed: 0_level_0,STORY,STORY,STORY,STORY
Unnamed: 0_level_1,count,unique,top,freq
SECTION,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1686,1673,This story has been published from a wire agen...,4
1,2772,2731,This story has been published from a wire agen...,13
2,1924,1914,We will leave no stone unturned to make the au...,3
3,1246,1233,This story has been published from a wire agen...,11


## Data Cleaning

In [7]:
# Removing Duplicates to avoid Overfitting
train.drop_duplicates(inplace=True)

#A punctuations string for reference (added other valid characters from the dataset)

all_punctuations = string.punctuation + '‘’,:”][],'

#Method to remove punctuation marks from the data

def punc_remove(raw_text):
    no_punc = "".join([punc for punc in raw_text if punc not in all_punctuations])
    return no_punc

def stopword_remover(raw_text):
    words = raw_text.split()
    raw_text = " ".join([i for i in words if i not in stopwords.words('english')])
    return raw_text

lemmer = nltk.stem.WordNetLemmatizer()

def lem(words):
    return " ".join([lemmer.lemmatize(word,'v') for word in words.split()])


# All together 

def text_cleaner(raw):
    cleaned_text = stopword_remover(punc_remove(raw))
    return lem(cleaned_text)


In [8]:
#Applying the cleaner method to the entire data

train['CLEAN_STORY'] = train['STORY'].apply(text_cleaner)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer


# Creating a bag of words Dictionery of words from the Data

bow_dictionery = CountVectorizer().fit(train['CLEAN_STORY'])

len(bow_dictionery.vocabulary_)

bow = bow_dictionery.transform(train['CLEAN_STORY'])

print(bow.shape)


from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer().fit(bow)

storytfidf = tfidf.transform(bow)



(7551, 35189)


### Training the Classifier

In [10]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(storytfidf, train['SECTION'])

### Predicting for the Test Data

In [11]:
#cleaning the test data

test['CLEAN_STORY'] = test['STORY'].apply(text_cleaner)

### Creating A Pipeline To Pre-Process The Data & Initialise The Classifier

In [12]:
#Importing the Pipeline module from sklearn
from sklearn.pipeline import Pipeline

#Initializing the pipeline with necessary transformations and the required classifier
pipe = Pipeline([('Bow', CountVectorizer()),
                ('TfIdf', TfidfTransformer()),
                ('Classifier',MultinomialNB())])


#Fitting the training data to the pipeline
pipe.fit(train['CLEAN_STORY'],train['SECTION'])

#Predicting the SECTION 
test_pred = pipe.predict(test['CLEAN_STORY'])

#Writing the predictions to an excel sheet
pd.DataFrame(test_pred, columns = ['SECTION']).to_excel('predictions.xlsx')


In [13]:
print(test['CLEAN_STORY'],test_pred)

0       2019 see gadgets like game smartphones wearabl...
1       It also unleash wave change MCU make sure futu...
2       It confuse pick right smartphone segregate top...
3       The mobile application integrate dashboard con...
4       We round gadgets show 2018 leave indelible mar...
                              ...                        
2743    According researchers fraud mobile channel gro...
2744    The iPhone XS XS Max share Apple A12 chipset a...
2745    On photography front Note 5 Pro feature 12MP 1...
2746    UDAY mandate discoms bring gap average revenue...
2747    Ripple also help bank customers send money peo...
Name: CLEAN_STORY, Length: 2748, dtype: object [1 2 1 ... 1 0 1]
