FEATURES: title, preprocessed content, length_preprocessed_content

MODEL: Logistic Regression only (ignored rows having only one class to be predicted)

In [1]:
import psycopg2
import pandas as pd
import json
import pandas.io.sql as sqlio
# preprocess
from nltk.corpus import stopwords
import re

In [27]:
%config InlineBackend.figure_format = 'retina'
from pathlib import Path
import time
import pickle
from contextlib import contextmanager
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.base import BaseEstimator
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [3]:
train_data=pd.read_csv("train_data3.csv")
train_data.head()

Unnamed: 0,index,id,content,title,preprocessed_content,label,hierarchy,hierarchy_#tiers,tier1,tier2,tier3,tier4
0,133,https://www.whowhatwear.com/5-looks-that-put-b...,After seeing burgundy boots on some of our fav...,5 Looks That Put Burgundy Boots On Our Must-Ha...,seeing burgundy boot favorite street style blo...,Womens\ Casual\ Wear,"[{'Style & Fashion': {""Women's Fashion"": {""Wom...",4,Style & Fashion,Womens Fashion,Womens Clothing,Womens Casual Wear
1,189,https://www.wisebread.com/what-you-need-to-kno...,"According to a September 29, 2009 Fidelity Inv...",How to Make the Most of Your 401K,according september fidelity investments study...,Personal\ Investing,[{'Personal Finance': 'Personal Investing'}],2,Personal Finance,Personal Investing,,
2,401,https://www.theturekclinic.com/blog/mindful-ea...,"“Thou shouldst eat to live; not live to eat,” ...",Mindful Eating,thou shouldst eat live live eat said socrates ...,Mens\ Health,"[{'Healthy Living': ""Men's Health""}]",2,Healthy Living,Mens Health,,
3,198,https://www.news-medical.net/news/20190611/Ear...,Researchers continue to dig for molecular clue...,Early life adversity and high levels of FKBP5 ...,researchers continue dig molecular clue better...,Hormonal\ Disorders,[{'Medical Health': {'Diseases and Conditions'...,4,Medical Health,Diseases and Conditions,Endocrine and Metabolic Diseases,Hormonal Disorders
4,24,https://hobbylark.com/card-games/Top-Strongest...,"As any duelist knows, the vast majority of Yu-...",Top 10 Strongest (Highest ATK) Monsters in Yu-...,duelist know vast majority yu gi oh match end ...,Card\ Games,[{'Hobbies & Interests': {'Games and Puzzles':...,3,Hobbies & Interests,Games and Puzzles,Card Games,


In [4]:
train_data['tier1'].value_counts()

Sports                      927
Business and Finance        900
Technology & Computing      700
Style & Fashion             644
Personal Finance            600
Travel                      578
Hobbies & Interests         521
Medical Health              500
Education                   490
Books and Literature        365
Pop Culture                 300
Music and Audio             300
Automotive                  300
Movies                      279
Events and Attractions      271
Family and Relationships    200
Food & Drink                200
Science                     200
Healthy Living              200
Real Estate                 200
Religion & Spirituality     100
Pets                        100
Careers                     100
News and Politics           100
Shopping                     92
Name: tier1, dtype: int64

In [5]:
train_data['tier2'].value_counts()

Computing                     700
Travel Type                   478
Womens Fashion                444
Industries                    400
Business                      300
                             ... 
Family and Children Movies     61
Model Toys                     59
Crime and Mystery Movies       59
Extreme Sports                 20
Golf                           17
Name: tier2, Length: 63, dtype: int64

In [6]:
train_data['tier3'].value_counts()

Internet                              400
Womens Clothing                       244
Endocrine and Metabolic Diseases      200
Motor Insurance                       200
Rail Travel                           150
Mutual Funds                          100
Housing Market                        100
Laptops                               100
Gasoline Prices                       100
Credit Cards                          100
Womens Accessories                    100
Standardized Testing                  100
Consumer Issues                       100
Womens Shoes and Footwear             100
Family Travel                         100
Mens Clothing                         100
Business Accounting & Finance         100
Classic Cars                          100
Apparel Industry                      100
Computer Peripherals                  100
Van                                   100
Computer Software and Applications    100
Publishing Industry                   100
Defense Industry                  

In [7]:
train_data['tier4'].value_counts()

Womens Casual Wear            200
Menopause                     100
Web Conferencing              100
Shareware and Freeware        100
Mens Business Wear            100
Web Hosting                   100
Infertility                   100
Venture Capital               100
Womens Jewelry and Watches    100
IT and Internet Support       100
Hormonal Disorders            100
Internet of Things            100
Recalls                       100
Womens Sportswear              44
Name: tier4, dtype: int64

In [8]:

tf_idf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))

# multinomial logistic regression 
logit = LogisticRegression(C=1e2, n_jobs=4, solver='lbfgs', 
                           random_state=42, verbose=0, 
                           multi_class='multinomial')



In [9]:
class Hierarchical(BaseEstimator):
    
    def __init__(self, 
                 base_model, 
                 model_store_path,
                 class_separator = '/',
                 min_size_to_train=50
                ):

        self.base_model = base_model
        self.model_store_path = Path(model_store_path)
        self.class_separator = class_separator
        self.min_size_to_train = min_size_to_train
        
        self.model_store_path.mkdir(exist_ok=True)
        
    def fit(self, X, y):
        
        lev1_classes = [label.split(self.class_separator)[0]
                        for label in y]
        
        with timer('Training level 1 model'):
            self.base_model.fit(X, lev1_classes)
            
            
            with open(self.model_store_path / 'level1_model.pkl', 'wb') as f:
                pickle.dump(self.base_model, f)
        
        
        for lev1_class in np.unique(lev1_classes):
            with timer(f'Training level 2 model for parent: {lev1_class}'):
                curr_X = X.loc[y.str.startswith(lev1_class)]
                curr_y = y.loc[y.str.startswith(lev1_class)].apply(lambda s: s.split(self.class_separator)[1]).str.strip()
                curr_y=curr_y[curr_y!=""]
                curr_X=X.loc[curr_y.index]
                if len(curr_X) < self.min_size_to_train:
                    print(f"Skipped class {lev1_class.replace(' ', '_')} as less data for further classification: {len(curr_X)}")
                    continue
                model_name = f"level2_model_{lev1_class.replace(' ', '_')}.pkl"
                try:
                    self.base_model.fit(curr_X, curr_y)                                  
                    with open(self.model_store_path / model_name, 'wb') as f:
                        pickle.dump(self.base_model, f)
                except ValueError:
                    #when no. of classes=1 within the main class
                    print(np.unique(curr_y),"in", lev1_class)
                    continue

        lev2_classes = [label.split(self.class_separator)[1]
                        for label in y]
        
        for lev2_class in np.unique(lev2_classes):
            if lev2_class=="":
                continue
            with timer(f'Training level 3 model for parent: {lev2_class}'):
                curr_X = X.loc[y.str.contains(lev2_class)]
                curr_y = y.loc[y.str.contains(lev2_class)].apply(lambda s: s.split(self.class_separator)[2]).str.strip()
                curr_y=curr_y[curr_y!=""]
                curr_X=X.loc[curr_y.index]
                if len(curr_X) < self.min_size_to_train:
                    print(f"Skipped class {lev2_class.replace(' ', '_')} as less data for further classification: {len(curr_X)}")
                    continue
                model_name = f"level3_model_{lev2_class.replace(' ', '_')}.pkl"
                try:
                    self.base_model.fit(curr_X, curr_y)                                  
                    with open(self.model_store_path / model_name, 'wb') as f:
                        pickle.dump(self.base_model, f)
                except ValueError:
                    print(np.unique(curr_y),"in", lev2_class)
                    continue

        lev3_classes = [label.split(self.class_separator)[2]
                        for label in y]
        
        for lev3_class in np.unique(lev3_classes):
            if lev3_class=="":
                continue
            with timer(f'Training level 4 model for parent: {lev3_class}'):
                curr_X = X.loc[y.str.contains(lev3_class)]
                curr_y = y.loc[y.str.contains(lev3_class)].apply(lambda s: s.split(self.class_separator)[3]).str.strip()
                curr_y=curr_y[curr_y!=""]
                curr_X=X.loc[curr_y.index]
                if len(curr_X) < self.min_size_to_train:
                    print(f"Skipped class {lev2_class.replace(' ', '_')} as less data for further classification: {len(curr_X)}")
                    continue
                model_name = f"level4_model_{lev3_class.replace(' ', '_')}.pkl"
                try:
                    self.base_model.fit(curr_X, curr_y)                                  
                    with open(self.model_store_path / model_name, 'wb') as f:
                        pickle.dump(self.base_model, f)
                except ValueError:
                    print(np.unique(curr_y),"in", lev3_class)
                    continue

    def predict(self, X):
        
        model_name =  'level1_model.pkl'
        with open(self.model_store_path / model_name, 'rb') as f:
            level1_model = pickle.load(f)
        
        level1_preds = level1_model.predict(X)
            
        level2_preds = np.zeros_like(level1_preds)
        level2_preds = np.array(level2_preds, dtype='object') 
            
        for lev1_class in np.unique(level1_preds):
            idx = level1_preds == lev1_class
            curr_X = X.iloc[idx]
            
            model_name = f"level2_model_{lev1_class.replace(' ', '_')}.pkl"
            
            if Path(self.model_store_path / model_name).exists():
            
                with open(self.model_store_path / model_name, 'rb') as f:
                    level2_model = pickle.load(f)

                curr_level2_preds = level2_model.predict(curr_X)
                level2_preds[idx] = curr_level2_preds
            
            else:
                level2_preds[idx] = lev1_class
        
        level3_preds = np.zeros_like(level2_preds)
        level3_preds = np.array(level3_preds, dtype='object') 
        level2_preds=level2_preds.astype(str)
        for lev2_class in np.unique(level2_preds):
            idx = level2_preds == lev2_class
            curr_X = X.iloc[idx]       
            if lev2_class=="":
                level3_preds[idx]=""
                continue
            model_name = f"level3_model_{lev2_class.replace(' ', '_')}.pkl"
            if Path(self.model_store_path / model_name).exists():
                with open(self.model_store_path / model_name, 'rb') as f:
                    level3_model = pickle.load(f)

                curr_level3_preds = level3_model.predict(curr_X)
                level3_preds[idx] = curr_level3_preds
            
            else:
                level3_preds[idx] = lev2_class
            
        level4_preds = np.zeros_like(level3_preds)
        level4_preds = np.array(level4_preds, dtype='object') 
        level3_preds=level3_preds.astype(str)   
        for lev3_class in np.unique(level3_preds):
            idx = level3_preds == lev3_class
            curr_X = X.iloc[idx]       
            if lev3_class=="":
                level4_preds[idx]=""
                continue
            model_name = f"level4_model_{lev3_class.replace(' ', '_')}.pkl"
            if Path(self.model_store_path / model_name).exists():
                with open(self.model_store_path / model_name, 'rb') as f:
                    level4_model = pickle.load(f)

                curr_level4_preds = level4_model.predict(curr_X)
                level4_preds[idx] = curr_level4_preds
            
            else:
                level4_preds[idx] = lev3_class
        
                
        return level1_preds, level2_preds,level3_preds,level4_preds

feature exploration

In [10]:
train_data.columns

Index(['index', 'id', 'content', 'title', 'preprocessed_content', 'label',
       'hierarchy', 'hierarchy_#tiers', 'tier1', 'tier2', 'tier3', 'tier4'],
      dtype='object')

In [11]:
train_data=train_data.drop(['index'],axis=1)

In [12]:
pd.isna(train_data).sum()

id                         0
content                  158
title                    222
preprocessed_content     175
label                      0
hierarchy                  0
hierarchy_#tiers           0
tier1                      0
tier2                    492
tier3                   5091
tier4                   7723
dtype: int64

In [13]:
def length(x):
    if not pd.isna(x):
        return (len(x.split(" ")))
train_data['length_preprocessed_content']=train_data['preprocessed_content'].apply(length)

In [14]:
train_data['length_preprocessed_content'].describe()

count    8992.000000
mean      433.420262
std       400.555787
min         1.000000
25%       177.000000
50%       334.000000
75%       568.000000
max      6528.000000
Name: length_preprocessed_content, dtype: float64

new training and test data

In [15]:
len(train_data)

9167

In [16]:
train_data=train_data.fillna('')

In [17]:
train_data['tier1']=train_data['tier1'].str.strip()
train_data['tier2']=train_data['tier2'].str.strip()
train_data['tier3']=train_data['tier3'].str.strip()
train_data['tier4']=train_data['tier4'].str.strip()

In [18]:
train_data['tier1_tier2_tier3_tier4'] = train_data['tier1']+ '/' + train_data['tier2']+ '/' + train_data['tier3']+'/' + train_data['tier4']
train_data.head()

Unnamed: 0,id,content,title,preprocessed_content,label,hierarchy,hierarchy_#tiers,tier1,tier2,tier3,tier4,length_preprocessed_content,tier1_tier2_tier3_tier4
0,https://www.whowhatwear.com/5-looks-that-put-b...,After seeing burgundy boots on some of our fav...,5 Looks That Put Burgundy Boots On Our Must-Ha...,seeing burgundy boot favorite street style blo...,Womens\ Casual\ Wear,"[{'Style & Fashion': {""Women's Fashion"": {""Wom...",4,Style & Fashion,Womens Fashion,Womens Clothing,Womens Casual Wear,31,Style & Fashion/Womens Fashion/Womens Clothing...
1,https://www.wisebread.com/what-you-need-to-kno...,"According to a September 29, 2009 Fidelity Inv...",How to Make the Most of Your 401K,according september fidelity investments study...,Personal\ Investing,[{'Personal Finance': 'Personal Investing'}],2,Personal Finance,Personal Investing,,,778,Personal Finance/Personal Investing//
2,https://www.theturekclinic.com/blog/mindful-ea...,"“Thou shouldst eat to live; not live to eat,” ...",Mindful Eating,thou shouldst eat live live eat said socrates ...,Mens\ Health,"[{'Healthy Living': ""Men's Health""}]",2,Healthy Living,Mens Health,,,363,Healthy Living/Mens Health//
3,https://www.news-medical.net/news/20190611/Ear...,Researchers continue to dig for molecular clue...,Early life adversity and high levels of FKBP5 ...,researchers continue dig molecular clue better...,Hormonal\ Disorders,[{'Medical Health': {'Diseases and Conditions'...,4,Medical Health,Diseases and Conditions,Endocrine and Metabolic Diseases,Hormonal Disorders,356,Medical Health/Diseases and Conditions/Endocri...
4,https://hobbylark.com/card-games/Top-Strongest...,"As any duelist knows, the vast majority of Yu-...",Top 10 Strongest (Highest ATK) Monsters in Yu-...,duelist know vast majority yu gi oh match end ...,Card\ Games,[{'Hobbies & Interests': {'Games and Puzzles':...,3,Hobbies & Interests,Games and Puzzles,Card Games,,63,Hobbies & Interests/Games and Puzzles/Card Games/


In [19]:
train_data['tier1_tier2_tier3_tier4'].value_counts().head()

Sports/Cricket//                                                     200
Style & Fashion/Womens Fashion/Womens Clothing/Womens Casual Wear    200
Education/Homework and Study//                                       200
Personal Finance/Insurance/Motor Insurance/                          200
Travel/Travel Type/Rail Travel/                                      150
Name: tier1_tier2_tier3_tier4, dtype: int64

In [20]:
r1=train_data[train_data['tier3'].str.contains("'s")]['tier3'].index
train_data.loc[r1,'tier3']=train_data.loc[r1,'tier3'].str.replace("'s","s")

r2=train_data[train_data['tier2'].str.contains("'s")]['tier2'].index
train_data.loc[r2,'tier2']=train_data.loc[r2,'tier2'].str.replace("'s","s")

train_data['tier1']=train_data['tier1'].str.strip()
train_data['tier2']=train_data['tier2'].str.strip()
train_data['tier3']=train_data['tier3'].str.strip()
train_data['tier4']=train_data['tier4'].str.strip()

train_data['tier2']=train_data['tier2'].str.strip("\"")
train_data['tier3']=train_data['tier3'].str.strip("\"")
train_data['tier4']=train_data['tier4'].str.strip("\"")

In [21]:
train_data.columns

Index(['id', 'content', 'title', 'preprocessed_content', 'label', 'hierarchy',
       'hierarchy_#tiers', 'tier1', 'tier2', 'tier3', 'tier4',
       'length_preprocessed_content', 'tier1_tier2_tier3_tier4'],
      dtype='object')

In [22]:
ind=train_data[train_data['title']==""].index
ind1=train_data[train_data['preprocessed_content']==""].index

In [23]:
train_data=train_data[~(train_data.index.isin(ind))]
train_data=train_data[~(train_data.index.isin(ind1))]
len(train_data)

8774

In [24]:
features1=train_data[['title','preprocessed_content','length_preprocessed_content']]#,'tier1','tier2','tier3','tier4']]
labels1=train_data[['tier1', 'tier2', 'tier3', 'tier4', 'tier1_tier2_tier3_tier4']]
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(features1, labels1,test_size=0.2,random_state=42,stratify=labels1['tier4'])

In [25]:
X_train_new = X_train_new.reset_index(drop=True)
y_train_new = y_train_new.reset_index(drop=True)
X_test_new = X_test_new.reset_index(drop=True)
y_test_new = y_test_new.reset_index(drop=True)

without embedding and other features(length of description and publisher_id)

In [28]:
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
combine = make_column_transformer((tf_idf,'title'),(tf_idf,'preprocessed_content'),(StandardScaler(),['length_preprocessed_content']),remainder='passthrough')

In [31]:
import numpy as np
base_model = Pipeline([('combine', combine), 
                       ('logit', logit)])

model =Hierarchical(
    base_model=base_model,
    model_store_path='modelsnew1'
)

model.fit(X_train_new,y_train_new['tier1_tier2_tier3_tier4'])

[Training level 1 model] done in 115 s
[Training level 2 model for parent: Automotive] done in 2 s
[Training level 2 model for parent: Books and Literature] done in 3 s
[Training level 2 model for parent: Business and Finance] done in 5 s
['Career Advice'] in Careers
[Training level 2 model for parent: Careers] done in 0 s
[Training level 2 model for parent: Education] done in 3 s
[Training level 2 model for parent: Events and Attractions] done in 0 s
['Parenting'] in Family and Relationships
[Training level 2 model for parent: Family and Relationships] done in 0 s
['Desserts and Baking'] in Food & Drink
[Training level 2 model for parent: Food & Drink] done in 0 s
[Training level 2 model for parent: Healthy Living] done in 1 s
[Training level 2 model for parent: Hobbies & Interests] done in 3 s
[Training level 2 model for parent: Medical Health] done in 2 s
[Training level 2 model for parent: Movies] done in 3 s
[Training level 2 model for parent: Music and Audio] done in 2 s
['Disast

In [32]:
level1_pred, level2_pred, level3_pred, level4_pred= model.predict(X_test_new)

In [33]:
oneclass=['Career Advice','Parenting','Desserts and Baking','Disasters','Sikhism','Womens Fashion','Computing','Van','Classic Cars','Audio Production','Standardized Testing','Motor Insurance','Adoption and Fostering','Credit Cards','Mutual Funds','Africa Travel','Venture Capital','Shareware and Freeware','Recalls','Infertility','Womens Jewelry and Watches']

In [34]:
finalresult=pd.concat([X_test_new,y_test_new['tier1_tier2_tier3_tier4'],y_test_new['tier1'],y_test_new['tier2'],y_test_new['tier3'],y_test_new['tier4'],pd.DataFrame(level1_pred,columns=['pred1']),pd.DataFrame(level2_pred,columns=['pred2']),pd.DataFrame(level3_pred,columns=['pred3']),pd.DataFrame(level4_pred,columns=['pred4'])],axis=1)

In [35]:
finalresult

Unnamed: 0,title,preprocessed_content,length_preprocessed_content,tier1_tier2_tier3_tier4,tier1,tier2,tier3,tier4,pred1,pred2,pred3,pred4
0,Menopause is not the end of women&#39;s love life,menopause signal end woman love life fact woma...,225,Medical Health/Diseases and Conditions/Endocri...,Medical Health,Diseases and Conditions,Endocrine and Metabolic Diseases,Menopause,Medical Health,Diseases and Conditions,Endocrine and Metabolic Diseases,Menopause
1,BREAKING: Tomorrowland Winter 2020 Is Official...,french government shutting tomorrowland winter...,125,Music and Audio/Dance and Electronic Music//,Music and Audio,Dance and Electronic Music,,,Music and Audio,Dance and Electronic Music,Dance and Electronic Music,Dance and Electronic Music
2,Credit Cards Stoke JPMorgan Comeback,credit card well jpm jpmorgan chase one leadin...,490,Personal Finance/Personal Debt/Credit Cards/,Personal Finance,Personal Debt,Credit Cards,,Personal Finance,Personal Debt,Personal Debt,Personal Debt
3,1993 Nissan President Is The RWD Flagship You ...,japanese domestic market jdm car bee knee dist...,156,Automotive/Auto Type/Classic Cars/,Automotive,Auto Type,Classic Cars,,Automotive,Auto Type,Auto Type,Auto Type
4,Immaculate Car Collection In Mexico Is Eclecti...,cool car universal language passion collecting...,204,Automotive/Auto Type/Classic Cars/,Automotive,Auto Type,Classic Cars,,Automotive,Auto Type,Auto Type,Auto Type
...,...,...,...,...,...,...,...,...,...,...,...,...
1750,How Georgia Satellites' 'Keep Your Hands to Yo...,thirty year ago georgia satellites released on...,884,Music and Audio/Country Music//,Music and Audio,Country Music,,,Music and Audio,Country Music,Country Music,Country Music
1751,The Farm Bill works for birds — and needs your...,farmland devoid bird certain part farmer land ...,420,Hobbies & Interests/Birdwatching//,Hobbies & Interests,Birdwatching,,,Hobbies & Interests,Birdwatching,Birdwatching,Birdwatching
1752,Adoption Champions,adoption become widely discussed supported eve...,307,Family and Relationships/Parenting/Adoption an...,Family and Relationships,Parenting,Adoption and Fostering,,Family and Relationships,Family and Relationships,Family and Relationships,Family and Relationships
1753,Poster poems: seeking refuge and asylum,channel tunnel besieged dispossessed mediterra...,378,Books and Literature/Poetry//,Books and Literature,Poetry,,,Books and Literature,Poetry,Poetry,Poetry


In [36]:
finalresult[finalresult['tier1']!=finalresult['pred1']]

Unnamed: 0,title,preprocessed_content,length_preprocessed_content,tier1_tier2_tier3_tier4,tier1,tier2,tier3,tier4,pred1,pred2,pred3,pred4
12,Over 1.5K Morgans Gathered To Celebrate 108 Ye...,morgan bespoke british automaker still us wood...,194,Automotive/Auto Type/Classic Cars/,Automotive,Auto Type,Classic Cars,,Events and Attractions,Amusement and Theme Parks,Amusement and Theme Parks,Amusement and Theme Parks
19,Commonwealth Biotechnologies to acquire GL Bio...,commonwealth biotechnologies inc cbi nasdaq ca...,364,Medical Health/Pharmaceutical Drugs//,Medical Health,Pharmaceutical Drugs,,,Business and Finance,Industries,Pharmaceutical Industry,Pharmaceutical Industry
21,Getting Into The Unicorn Boom: 10 Mutual Funds...,pure storage conducted ipo early october billi...,1605,Personal Finance/Personal Investing/Mutual Funds/,Personal Finance,Personal Investing,Mutual Funds,,Business and Finance,Industries,Apparel Industry,Apparel Industry
46,MediSapiens launches unified cloud computing a...,medisapiens launch first kind fully integrated...,245,Medical Health/Pharmaceutical Drugs//,Medical Health,Pharmaceutical Drugs,,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
71,IBM Quantum Computer Does Record-Breaking Chem...,cool sound quantum computer probably best suit...,377,Science/Physics//,Science,Physics,,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
...,...,...,...,...,...,...,...,...,...,...,...,...
1715,Port Aransas Chefs Finish Top Ten in Their Cat...,port aransas texas nov prnewswire port aransas...,287,Events and Attractions/Amusement and Theme Par...,Events and Attractions,Amusement and Theme Parks,,,Sports,Bowling,Bowling,Bowling
1716,U.S. Department of Labor Issues Final Joint Em...,washington u department labor dol revealed fin...,237,Real Estate/Hotel Properties//,Real Estate,Hotel Properties,,,Business and Finance,Business,Business Accounting & Finance,Business Accounting & Finance
1736,PharmSource ranks Patheon as top global contra...,ranking based independent pharmsource r resear...,234,Medical Health/Pharmaceutical Drugs//,Medical Health,Pharmaceutical Drugs,,,Business and Finance,Industries,Pharmaceutical Industry,Pharmaceutical Industry
1744,"Cloud Flash, Standard Tests And Storage Versus...",enterprise data volume doubling every two year...,962,Education/Educational Assessment/Standardized ...,Education,Educational Assessment,Standardized Testing,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing


In [37]:
print("Accuracy of level 1",1-len(finalresult[finalresult['tier1']!=finalresult['pred1']])/len(finalresult))

Accuracy of level 1 0.9133903133903134


# Accuracy of level 1 
<h2><span style="color:red">0.9133903133903134</span></h2>


In [38]:
finalresult[(finalresult['tier2']!=finalresult['pred2'])&(finalresult['tier2']!="")]

Unnamed: 0,title,preprocessed_content,length_preprocessed_content,tier1_tier2_tier3_tier4,tier1,tier2,tier3,tier4,pred1,pred2,pred3,pred4
5,How partnerships enriched the learning for Nai...,eight year research low income neighbourhood n...,552,Education/Homework and Study//,Education,Homework and Study,,,Education,Primary Education,Primary Education,Primary Education
6,New ASUS ExpertBook A Solid Work Laptop,asus expertbook b9450 first thing notice new a...,608,Technology & Computing/Computing/Laptops/,Technology & Computing,Computing,Laptops,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
7,The Normalization of Mean Leaders: A Recipe fo...,many u wondering glory day kalanicks shkrelis ...,824,Business and Finance/Business/Business Banking...,Business and Finance,Business,Business Banking & Finance,Venture Capital,Business and Finance,Industries,Apparel Industry,Apparel Industry
8,Are Desktop PCs And Laptops Becoming Obsolete?,desktop pcs laptop rendered obsolete originall...,524,Technology & Computing/Computing/Laptops/,Technology & Computing,Computing,Laptops,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
9,"Video: Don't freak, I'm Sikh",turban reminds good person say pardeep singh v...,78,Religion & Spirituality/Sikhism//,Religion & Spirituality,Sikhism,,,Religion & Spirituality,Religion & Spirituality,Religion & Spirituality,Religion & Spirituality
...,...,...,...,...,...,...,...,...,...,...,...,...
1741,Facebook Open Sources New Compression And Stor...,announced open sourcing handful infrastructure...,620,Technology & Computing/Computing/Computer Soft...,Technology & Computing,Computing,Computer Software and Applications,Shareware and Freeware,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
1744,"Cloud Flash, Standard Tests And Storage Versus...",enterprise data volume doubling every two year...,962,Education/Educational Assessment/Standardized ...,Education,Educational Assessment,Standardized Testing,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
1748,The top companies hiring in June 2016,michael chearyso british summer time finally c...,540,Careers/Career Advice//,Careers,Career Advice,,,Careers,Careers,Careers,Careers
1749,XAG Robot Joins Drone Fleet to Initiate Ground...,guangzhou china feb prnewswire since emergence...,686,Business and Finance/Industries/Defense Industry/,Business and Finance,Industries,Defense Industry,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing


In [39]:
print("Accuracy of level 2",1-len(finalresult[(finalresult['tier2']!=finalresult['pred2'])&(finalresult['tier2']!="")])/len(finalresult))

Accuracy of level 2 0.6957264957264957


 # Accuracy of level 2 
<h2><span style="color:red">0.6957264957264957</span></h2>


In [40]:
finalresult[(finalresult['tier3']!=finalresult['pred3'])&(finalresult['tier3']!="")]

Unnamed: 0,title,preprocessed_content,length_preprocessed_content,tier1_tier2_tier3_tier4,tier1,tier2,tier3,tier4,pred1,pred2,pred3,pred4
2,Credit Cards Stoke JPMorgan Comeback,credit card well jpm jpmorgan chase one leadin...,490,Personal Finance/Personal Debt/Credit Cards/,Personal Finance,Personal Debt,Credit Cards,,Personal Finance,Personal Debt,Personal Debt,Personal Debt
3,1993 Nissan President Is The RWD Flagship You ...,japanese domestic market jdm car bee knee dist...,156,Automotive/Auto Type/Classic Cars/,Automotive,Auto Type,Classic Cars,,Automotive,Auto Type,Auto Type,Auto Type
4,Immaculate Car Collection In Mexico Is Eclecti...,cool car universal language passion collecting...,204,Automotive/Auto Type/Classic Cars/,Automotive,Auto Type,Classic Cars,,Automotive,Auto Type,Auto Type,Auto Type
6,New ASUS ExpertBook A Solid Work Laptop,asus expertbook b9450 first thing notice new a...,608,Technology & Computing/Computing/Laptops/,Technology & Computing,Computing,Laptops,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
7,The Normalization of Mean Leaders: A Recipe fo...,many u wondering glory day kalanicks shkrelis ...,824,Business and Finance/Business/Business Banking...,Business and Finance,Business,Business Banking & Finance,Venture Capital,Business and Finance,Industries,Apparel Industry,Apparel Industry
...,...,...,...,...,...,...,...,...,...,...,...,...
1739,Durbanville restaurant outdoes Test Kitchen,test kitchen la colombe myoga jardine hartford...,195,Travel/Travel Locations/Africa Travel/,Travel,Travel Locations,Africa Travel,,Travel,Travel Locations,Travel Locations,Travel Locations
1741,Facebook Open Sources New Compression And Stor...,announced open sourcing handful infrastructure...,620,Technology & Computing/Computing/Computer Soft...,Technology & Computing,Computing,Computer Software and Applications,Shareware and Freeware,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
1744,"Cloud Flash, Standard Tests And Storage Versus...",enterprise data volume doubling every two year...,962,Education/Educational Assessment/Standardized ...,Education,Educational Assessment,Standardized Testing,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
1749,XAG Robot Joins Drone Fleet to Initiate Ground...,guangzhou china feb prnewswire since emergence...,686,Business and Finance/Industries/Defense Industry/,Business and Finance,Industries,Defense Industry,,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing


In [41]:
print("Accuracy of level 3",1-len(finalresult[(finalresult['tier3']!=finalresult['pred3'])&(finalresult['tier3']!="")])/len(finalresult))

Accuracy of level 3 0.7230769230769231


# Accuracy of level 3 
<h2><span style="color:red">0.7230769230769231</span></h2>


In [42]:
finalresult[(finalresult['tier4']!=finalresult['pred4'])&(finalresult['tier4']!="")]

Unnamed: 0,title,preprocessed_content,length_preprocessed_content,tier1_tier2_tier3_tier4,tier1,tier2,tier3,tier4,pred1,pred2,pred3,pred4
7,The Normalization of Mean Leaders: A Recipe fo...,many u wondering glory day kalanicks shkrelis ...,824,Business and Finance/Business/Business Banking...,Business and Finance,Business,Business Banking & Finance,Venture Capital,Business and Finance,Industries,Apparel Industry,Apparel Industry
32,Mindtree von ISG zu einem Marktführer für Digi...,unternehmen f r einschneidendes design thinkin...,701,Technology & Computing/Computing/Internet/IT a...,Technology & Computing,Computing,Internet,IT and Internet Support,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
34,Breast cancer medication letrozole could incre...,medicine used breast cancer treatment consider...,344,Medical Health/Diseases and Conditions/Reprodu...,Medical Health,Diseases and Conditions,Reproductive Health,Infertility,Medical Health,Diseases and Conditions,Reproductive Health,Reproductive Health
41,7 Comfy Vacation Outfit Ideas That Won't Make ...,packing vacation sometimes feel like pick dres...,139,Style & Fashion/Womens Fashion/Womens Clothing...,Style & Fashion,Womens Fashion,Womens Clothing,Womens Casual Wear,Style & Fashion,Style & Fashion,Style & Fashion,Style & Fashion
55,17 Reasons to Finally Invest in a Leather Jack...,every season try beautiful leather jacket look...,80,Style & Fashion/Womens Fashion/Womens Clothing...,Style & Fashion,Womens Fashion,Womens Clothing,Womens Casual Wear,Style & Fashion,Style & Fashion,Style & Fashion,Style & Fashion
...,...,...,...,...,...,...,...,...,...,...,...,...
1707,The J. M. Smucker Company Issues Voluntary Rec...,orrville ohio dec prnewswire j smucker company...,162,Business and Finance/Business/Consumer Issues/...,Business and Finance,Business,Consumer Issues,Recalls,Business and Finance,Business,Consumer Issues,Consumer Issues
1718,Thrillist's Ben Lerer on Success as a Young Trep,year old ben lerer proven sweet spot medium co...,381,Business and Finance/Business/Business Banking...,Business and Finance,Business,Business Banking & Finance,Venture Capital,Business and Finance,Business,Business Banking & Finance,Business Banking & Finance
1737,Why CIOs Need To Think About The Internet Of T...,stunning example provincialism washington post...,808,Technology & Computing/Computing/Internet/Inte...,Technology & Computing,Computing,Internet,Internet of Things,Technology & Computing,Technology & Computing,Technology & Computing,Technology & Computing
1740,Why Venture Capitalists Heart NY,big apple recently outpaced massachusetts fame...,208,Business and Finance/Business/Business Banking...,Business and Finance,Business,Business Banking & Finance,Venture Capital,Business and Finance,Business,Business Banking & Finance,Business Banking & Finance


In [43]:
print("Accuracy of level 4",1-len(finalresult[(finalresult['tier4']!=finalresult['pred4'])&(finalresult['tier4']!="")])/len(finalresult))

Accuracy of level 4 0.8683760683760684


# Accuracy of level 4
<h2><span style="color:red">0.8683760683760684</span></h2>


Total Accuracy: 0.8

# System End-to-End Accuracy
<h2><span style="color:red">0.80</span></h2>
