# **Load Dataset Files**

In [36]:
## Importing the tensorflow version
import tensorflow as tf
print(tf.__version__)
tf.set_random_seed(42)

1.15.0


In [37]:
## Mounting the drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import warnings
warnings.filterwarnings('ignore')

In [39]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
blog=pd.read_csv("/content/gdrive/My Drive/AIML/blogtext.csv")

In [41]:
blog.shape

(681284, 7)

In [42]:
blog.dtypes

id         int64
gender    object
age        int64
topic     object
sign      object
date      object
text      object
dtype: object

In [0]:
corp=blog.head(20000)

In [44]:
corp.shape

(20000, 7)

In [45]:
corp.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


**Preprocess rows of the “text” column**

**a. Remove unwanted characters**

**b. Convert text to lowercase**

**c. Remove unwanted spaces**

**d. Remove stopwords**


In [46]:
nltk.download("stopwords")
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
import re
corp['text']=corp['text'].apply(lambda s: s.lower())
corp['text']=corp['text'].apply(lambda s: re.sub('[^0-9a-z #+_]'," ",s))
corp['text']=corp['text'].apply(lambda s: s.strip())

In [0]:
re_stop = re.compile(r"\b(" + "|".join(stop) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop
    return re_stop.sub(" ", sentence)
corp['text'] = corp['text'].apply(removeStopWords)

In [49]:
corp.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found + 100 pages 4 5 mb pdf fil...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je ei...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoo toolbar capture urls po...


In [0]:
corp['age']=corp.age.astype('str')

In [51]:
corp.dtypes

id         int64
gender    object
age       object
topic     object
sign      object
date      object
text      object
dtype: object

As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence

a. Label columns to merge: “gender”, “age”, “topic”, “sign”

b. After completing the previous step, there should be only two columns in your dataframe i.e. “text” and “labels” as shown in the below image

In [0]:
corp['labels']= corp[['gender','age','topic','sign']].values.tolist()


In [53]:
corp.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
0,2059027,male,15,Student,Leo,"14,May,2004",info found + 100 pages 4 5 mb pdf fil...,"[male, 15, Student, Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag ...,"[male, 15, Student, Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je ei...,"[male, 15, Student, Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing,"[male, 15, Student, Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoo toolbar capture urls po...,"[male, 33, InvestmentBanking, Aquarius]"


**Separate features and labels, and split the data into training and testing**

In [0]:
X=corp['text']
y=corp['labels']

In [0]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.30,random_state=100)

In [56]:
print("Train data shape:",xtrain.shape,ytrain.shape)
print("Test Data shape:",xtest.shape,ytest.shape)

Train data shape: (14000,) (14000,)
Test Data shape: (6000,) (6000,)


**Vectorize the features**

**a. Create a Bag of Words using count vectorizer**

**i. Use ngram_range=(1, 2)**

**ii. Vectorize training and testing features**

**b. Print the term-document matrix**

In [0]:
vect = CountVectorizer(ngram_range=(1,2))

In [0]:
X_train_dtm = vect.fit_transform(xtrain)
X_test_dtm = vect.transform(xtest)

In [59]:
print("Train term-document Matrix:",X_train_dtm)
print('\n')
print("Test term-document Matrix:","\n",X_test_dtm)

Train term-document Matrix:   (0, 44474)	1
  (0, 75297)	1
  (0, 751107)	1
  (0, 577776)	1
  (0, 914978)	2
  (0, 44478)	1
  (0, 75361)	1
  (0, 752345)	1
  (0, 580128)	1
  (0, 918413)	1
  (1, 751107)	1
  (1, 979706)	1
  (1, 477466)	1
  (1, 259177)	1
  (1, 721366)	1
  (1, 718607)	1
  (1, 987895)	1
  (1, 755050)	2
  (1, 798583)	1
  (1, 506657)	1
  (1, 485888)	2
  (1, 951013)	2
  (1, 144553)	2
  (1, 132259)	2
  (1, 631234)	1
  :	:
  (13999, 804050)	1
  (13999, 158587)	1
  (13999, 218625)	1
  (13999, 540380)	1
  (13999, 616048)	1
  (13999, 419679)	1
  (13999, 631303)	1
  (13999, 179404)	1
  (13999, 628515)	1
  (13999, 137576)	1
  (13999, 422892)	1
  (13999, 53556)	1
  (13999, 461214)	1
  (13999, 115667)	2
  (13999, 826021)	1
  (13999, 184567)	1
  (13999, 982842)	1
  (13999, 568378)	1
  (13999, 156962)	1
  (13999, 378963)	1
  (13999, 615022)	1
  (13999, 318655)	1
  (13999, 826053)	1
  (13999, 716310)	1
  (13999, 553622)	1


Test term-document Matrix: 
   (0, 371585)	1
  (1, 33450)	1
  (1, 532

**Create a dictionary to get the count of every label i.e. the key will be label name and value willbe the total count of the label.**

In [60]:
corp['id'].nunique()

530

In [0]:
corp_unique=corp.groupby('id').first().reset_index()

In [0]:
corp_unique=corp_unique.drop(['gender','age','topic','sign','date'],axis=1)

In [63]:
corp_unique.head()

Unnamed: 0,id,text,labels
0,23191,twenty something call quarter life cris...,"[female, 23, Advertising, Taurus]"
1,72355,saw show discovery komodo dragon menti...,"[male, 27, indUnk, Leo]"
2,467705,starting anew ditching password protected...,"[female, 25, indUnk, Libra]"
3,468786,hi leave messages fun sharing thing...,"[male, 45, Religion, Aries]"
4,480727,inner conscious tells open others me...,"[male, 23, indUnk, Pisces]"


In [0]:
label_counts=dict()

for labels in corp_unique.labels.values:
  for label in labels:
    if label in label_counts:
      label_counts[label] +=1
    else:
      label_counts[label]=1

In [65]:
label_counts

{'13': 13,
 '14': 35,
 '15': 48,
 '16': 57,
 '17': 59,
 '23': 62,
 '24': 45,
 '25': 51,
 '26': 37,
 '27': 33,
 '33': 22,
 '34': 9,
 '35': 15,
 '36': 7,
 '37': 5,
 '38': 3,
 '39': 5,
 '40': 1,
 '41': 4,
 '42': 4,
 '43': 1,
 '44': 2,
 '45': 5,
 '46': 5,
 '47': 1,
 '48': 1,
 'Accounting': 4,
 'Advertising': 4,
 'Agriculture': 1,
 'Aquarius': 43,
 'Architecture': 2,
 'Aries': 51,
 'Arts': 18,
 'Automotive': 2,
 'Banking': 5,
 'Biotech': 1,
 'BusinessServices': 10,
 'Cancer': 52,
 'Capricorn': 41,
 'Communications-Media': 7,
 'Construction': 1,
 'Consulting': 8,
 'Education': 28,
 'Engineering': 7,
 'Fashion': 2,
 'Gemini': 41,
 'Government': 6,
 'HumanResources': 3,
 'Internet': 15,
 'InvestmentBanking': 2,
 'Law': 5,
 'LawEnforcement-Security': 3,
 'Leo': 46,
 'Libra': 36,
 'Manufacturing': 1,
 'Marketing': 4,
 'Military': 3,
 'Museums-Libraries': 4,
 'Non-Profit': 12,
 'Pisces': 37,
 'Publishing': 3,
 'RealEstate': 2,
 'Religion': 7,
 'Sagittarius': 64,
 'Science': 6,
 'Scorpio': 39,
 'S

**Transform the labels**

**As we have noticed before, in this task each example can have multiple tags. To deal withsuch kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s.** 

**For this purpose, it is convenient to use MultiLabelBinarizer from sklearn**

**a. Convert your train and test labels using MultiLabelBinarizer**

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [0]:
y_train_dtm=pd.DataFrame(mlb.fit_transform(ytrain),columns=mlb.classes_)

In [0]:
y_test_dtm=pd.DataFrame(mlb.transform(ytest),columns=mlb.classes_)

# **Choose a classifier**

**In this task, we suggest using the One-vs-Rest approach, which is implemented in OneVsRestClassifier class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use LogisticRegression.**

**It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time because the number of classifiers to train is large.**

**a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label**

**b. As One-vs-Rest approach might not have been discussed in the sessions, we are providing you the code for that**

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

classifier=LogisticRegression(solver='lbfgs')
classifier=OneVsRestClassifier(classifier)

# **Fit the classifier, make predictions and get the accuracy**

**a. Print the following**

**i. Accuracy score**

**ii. F1 score**

**iii. Average precision score**

**iv. Average recall score**

**v. Tip: Make sure you are familiar with all of them. How would you expect the things to work for the multi-label scenario? Read about micro/macro/weighted averaging**

In [70]:
classifier.fit(X_train_dtm,y_train_dtm)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
y_train_pred=classifier.predict(X_train_dtm)
y_test_pred=classifier.predict(X_test_dtm)

In [72]:
print("Train Accuracy:",metrics.accuracy_score(y_train_dtm,y_train_pred))
print("Test Accuracy:",metrics.accuracy_score(y_test_dtm,y_test_pred))

Train Accuracy: 0.9367857142857143
Test Accuracy: 0.16033333333333333


In [73]:
print("Train Accuracy:","\n")
print(metrics.classification_report(y_train_dtm,y_train_pred))
print("Test Accuracy:","\n")
print(metrics.classification_report(y_test_dtm,y_test_pred))

Train Accuracy: 

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        84
           1       1.00      0.96      0.98       559
           2       1.00      0.91      0.96       769
           3       1.00      0.95      0.98       875
           4       1.00      0.94      0.97      1378
           5       1.00      0.91      0.95      1381
           6       1.00      0.96      0.98      1073
           7       1.00      0.95      0.97       841
           8       1.00      0.90      0.95       675
           9       1.00      0.97      0.98      1652
          10       1.00      0.94      0.97       522
          11       1.00      0.97      0.98       622
          12       1.00      0.96      0.98      1708
          13       1.00      0.96      0.98      1198
          14       1.00      0.96      0.98        97
          15       1.00      1.00      1.00        58
          16       1.00      0.96      0.98        77
         

**Print true label and predicted label for any five examples**

In [74]:
print("True Test Label:","\n")
print(y_test_dtm.head())
print("\n")
print("Predicted Test Label:","\n")
print(y_test_pred[0:5])

True Test Label: 

   13  14  15  16  17  ...  Transportation  Virgo  female  indUnk  male
0   0   0   0   0   0  ...               0      0       0       0     1
1   0   0   0   0   0  ...               0      0       0       0     1
2   0   0   0   0   0  ...               0      0       0       0     1
3   0   0   0   0   0  ...               0      0       0       0     1
4   0   0   0   0   0  ...               0      0       1       1     0

[5 rows x 76 columns]


Predicted Test Label: 

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0

In [0]:
y_test_pred_inverse=mlb.inverse_transform(y_test_pred)

In [84]:
print("True Test Label:","\n")
print(ytest.head(5))
print("\n")
print("Predicted Test Label:","\n")
print(y_test_pred_inverse[0:5])

True Test Label: 

2917      [male, 35, Technology, Aries]
2234      [male, 35, Technology, Aries]
14396    [male, 46, Technology, Gemini]
1781      [male, 35, Technology, Aries]
11102      [female, 33, indUnk, Cancer]
Name: labels, dtype: object


Predicted Test Label: 

[('male',), ('male',), ('Student', 'male'), ('35', 'Aries', 'Technology', 'male'), ('female',)]
