# Text Representation Using Bag Of n-grams

In [59]:
# Import packages
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import spacy
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Fit a default (unigram) vectorizer on a single text document.
# fit() returns the vectorizer itself, so construction and fitting can be chained.
v = CountVectorizer().fit(["Karam Alhanatleh is looking for a house"])
# Learned mapping from each token to its column index
v.vocabulary_

{'karam': 4, 'alhanatleh': 0, 'is': 3, 'looking': 5, 'for': 1, 'house': 2}

In [3]:
# Bigrams only: ngram_range=(2, 2) keeps just the two-word sequences.
# The default token pattern only keeps tokens of two or more characters,
# so "a" is dropped and "for house" becomes a bigram.
v = CountVectorizer(ngram_range=(2, 2)).fit(
    ["Karam Alhanatleh is looking for a house"]
)
v.vocabulary_

{'karam alhanatleh': 3,
 'alhanatleh is': 0,
 'is looking': 2,
 'looking for': 4,
 'for house': 1}

In [4]:
# Unigrams and bigrams together: ngram_range=(1, 2)
v = CountVectorizer(ngram_range=(1, 2)).fit(
    ["Karam Alhanatleh is looking for a house"]
)
v.vocabulary_

{'karam': 7,
 'alhanatleh': 0,
 'is': 5,
 'looking': 9,
 'for': 2,
 'house': 4,
 'karam alhanatleh': 8,
 'alhanatleh is': 1,
 'is looking': 6,
 'looking for': 10,
 'for house': 3}

In [5]:
#######################

In [6]:
# Load the spaCy pipeline named "en_core_web_sm"; preprocess() uses it to tokenize and lemmatize text
nlp=spacy.load("en_core_web_sm")

In [7]:
def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded, and the lemmas of the
    remaining tokens are joined with single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [8]:
# Sanity check: "is", "for" and "a" are removed and "looking" is reduced to its lemma "look"
preprocess("Karam Alhanatleh is looking for a house")

'Karam Alhanatleh look house'

In [9]:
# Second check: "is" is removed and "eating" is reduced to its lemma "eat"
preprocess("Karam is eating Pizza")

'Karam eat Pizza'

In [10]:
# Small toy corpus used to demonstrate building an n-gram vocabulary
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [11]:
# Run every toy sentence through preprocess() (stop-word/punctuation removal + lemmatization)
corpus_processed = list(map(preprocess, corpus))

In [12]:
# Display the preprocessed sentences
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [13]:
# Learn a unigram + bigram vocabulary from the preprocessed toy corpus.
# fit() returns the vectorizer itself, so construction and fitting can be chained.
count_Vector = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
count_Vector.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [14]:
# Vectorize a sentence whose unigrams and bigrams are all in the fitted vocabulary
count_Vector.transform(['Thor eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [15]:
# "hulk" and "hulk eat" are not in the fitted vocabulary, so only
# "eat", "pizza" and "eat pizza" receive non-zero counts
count_Vector.transform(['Hulk eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [17]:
######################################

### Load Dataset: News Category Classifier (JSON)

In [21]:
# Load the news dataset from a JSON file (path is relative to the working directory)
df = pd.read_json("data/news_dataset.json")
# Print the dataset shape as (rows, columns)
print(df.shape)

(12695, 2)


In [23]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [27]:
# Count the rows in each category to check the class balance
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [25]:
# Use undersampling to address the class imbalance

In [29]:
# Size of the rarest class; every class is undersampled down to this many rows
min_sample = df["category"].value_counts().min()
min_sample

1381

In [35]:
# Draw `min_sample` rows from each category with the same fixed seed, so every
# class ends up equally sized and the draw is reproducible.
df_sport, df_business, df_crime, df_science = (
    df.loc[df["category"] == label].sample(n=min_sample, random_state=2001)
    for label in ("SPORTS", "BUSINESS", "CRIME", "SCIENCE")
)

In [36]:
# Stack the four equally sized class samples row-wise (pd.concat defaults to axis=0)
df_new_balance = pd.concat([df_business, df_sport, df_crime, df_science])

In [41]:
# Expect 4 classes x min_sample rows each
df_new_balance.shape

(5524, 2)

In [39]:
# Inspect the concatenated frame; rows are still grouped by class in concat order
df_new_balance

Unnamed: 0,text,category
5611,The 50 Best Quotes From The 2015 Cannes Lions ...,BUSINESS
9323,Young and Entrepreneurial: How College Entrepr...,BUSINESS
11355,Macy's Announces Thousands Of Job Cuts The ret...,BUSINESS
6092,"Women in Business Q&A: Christine Wheeler, Foun...",BUSINESS
10188,What Does the iPhone SE Mean for Mobile Advert...,BUSINESS
...,...,...
7113,The Fastest Way To Chill Your Beer Happy drink...,SCIENCE
9022,8 Fascinating Things We Learned About The Mind...,SCIENCE
6473,New Study Examining Kids' Alcohol Consumption ...,SCIENCE
115,Ancient Earth's 'Face-Lift' Detailed In New Mo...,SCIENCE


In [42]:
# Shuffle all rows so the four undersampled class blocks are interleaved.
# frac=1 draws every row exactly once. The fixed random_state makes the
# shuffle reproducible; without it the row order changes on every run, and
# because train_test_split picks rows by position, the split would change too
# even though the split itself is seeded.
df_new_balance = df_new_balance.sample(frac=1, random_state=2001)

In [43]:
# Inspect the frame after shuffling; categories are now interleaved
df_new_balance

Unnamed: 0,text,category
741,It's Time to Declare Your Personal Independenc...,BUSINESS
2337,11 Essentials You Need To Host The Perfect Tai...,SPORTS
5614,NYPD Union Leader Lashes Out At Mayor Again,CRIME
12127,Former Gymnastics Coach Tells Larry Nassar To ...,SPORTS
1453,LeBron Reaches Out To Sick Teen: 'Together We ...,SPORTS
...,...,...
12322,4 Technology Trends for Small Business While s...,BUSINESS
8408,"Simone Biles Isn't The Next Anyone, She's 'The...",SPORTS
12487,The Reconnection Agenda: The Fun and Easy Rout...,BUSINESS
8463,Tomb Of Lost Egyptian Queen Discovered,SCIENCE


In [48]:
# Encode each category label as an integer class id
target = dict(BUSINESS=0, SPORTS=1, CRIME=2, SCIENCE=3)

# Add the numeric label column used as the classification target
df_new_balance["category_num"] = df_new_balance["category"].map(target)

##### Split data into train and test 

In [52]:
# Hold out 20% of the balanced data for testing. Stratifying on the encoded
# label keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df_new_balance["text"],
    df_new_balance["category_num"],
    test_size=0.2,
    random_state=2001,
    stratify=df_new_balance["category_num"],
)

In [53]:
# Number of training samples
print(X_train.shape)

(4419,)


In [57]:
# Class distribution in the training set (stratified, so the classes are nearly equal)
print(y_train.value_counts())

2    1105
1    1105
3    1105
0    1104
Name: category_num, dtype: int64


In [58]:
# Class distribution in the test set (stratified, so the classes are nearly equal)
print(y_test.value_counts())

0    277
3    276
1    276
2    276
Name: category_num, dtype: int64


### Build the Model with a Pipeline

In [61]:
# Chain bag-of-words counting with a multinomial Naive Bayes classifier so
# both steps are fitted and applied together.
pipline_model = Pipeline(
    steps=[
        ("count_vector", CountVectorizer()),
        ("Multinomial", MultinomialNB()),
    ]
)

In [62]:
# Fit the vectorizer and the classifier on the training text
# (X_train is the raw `text` column; preprocess() is not applied here)
pipline_model.fit(X_train ,y_train)

Pipeline(steps=[('count_vector', CountVectorizer()),
                ('Multinomial', MultinomialNB())])

In [63]:
# Predict the encoded category ids for the held-out test texts
y_pred=pipline_model.predict(X_test)
y_pred

array([3, 2, 0, ..., 0, 3, 2], dtype=int64)

In [65]:
# Print the classification report; the row labels 0-3 are the ids assigned in the `target` mapping
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.80      0.91      0.85       277
           1       0.91      0.82      0.86       276
           2       0.87      0.91      0.89       276
           3       0.92      0.84      0.88       276

    accuracy                           0.87      1105
   macro avg       0.88      0.87      0.87      1105
weighted avg       0.88      0.87      0.87      1105

