In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import the Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
df_temp = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Project/Dataset - blogtext.csv')

In [None]:
df_temp.shape

(681284, 7)

In [None]:
# taking a sample as saving the memory
df = df_temp.head(50000)

In [None]:
del df_temp

In [None]:
df.sample(10)

Unnamed: 0,id,gender,age,topic,sign,date,text
42019,888601,female,24,indUnk,Libra,"06,September,2003","Alrighty then, I've finished with two c..."
11238,3350684,male,15,indUnk,Capricorn,"16,July,2004",today is racial harmony day.. then got ...
6203,1708471,female,17,indUnk,Cancer,"09,May,2004",
30495,3687738,male,47,indUnk,Aries,"12,agosto,2004",China able to handle oil's jump Cheap ...
11092,2061087,female,33,indUnk,Cancer,"11,May,2004","Wow, so blogger has made some big chang..."
16478,1915813,male,27,indUnk,Sagittarius,"02,August,2004",Doom 3 leaked Just read that ...
42662,3511005,female,27,Accounting,Scorpio,"10,August,2004","So today is wonderful, other than allergies..."
40609,3581933,female,25,Banking,Sagittarius,"17,July,2004",we met our neighbors from across th...
37911,3636103,male,26,indUnk,Aquarius,"09,August,2004","10. You left in your 39 x 23, an..."
9740,936098,male,17,Student,Gemini,"03,August,2004",Wait. What?


In [None]:
df.shape

(50000, 7)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      50000 non-null  int64 
 1   gender  50000 non-null  object
 2   age     50000 non-null  int64 
 3   topic   50000 non-null  object
 4   sign    50000 non-null  object
 5   date    50000 non-null  object
 6   text    50000 non-null  object
dtypes: int64(2), object(5)
memory usage: 2.7+ MB


In [None]:
# Statistical summary
df.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,50000,,,,2388440.0,1254200.0,17944.0,1078080.0,2764900.0,3525620.0,4337130.0
gender,50000,2.0,male,25815.0,,,,,,,
age,50000,,,,24.3621,7.98284,13.0,17.0,24.0,27.0,48.0
topic,50000,40.0,indUnk,17560.0,,,,,,,
sign,50000,12.0,Aries,7795.0,,,,,,,
date,50000,1180.0,"05,August,2004",2600.0,,,,,,,
text,50000,49249.0,urlLink,48.0,,,,,,,


### Data Pre-proecessing

In [None]:
df.isnull().sum() # check the null value

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [None]:
df.isnull().values.any()

False

In [None]:
df = df.drop(columns = ['id', 'date'], axis = 1) # Dropping the column

In [None]:
df['age'].unique()

array([15, 33, 14, 25, 17, 23, 37, 26, 24, 27, 45, 34, 41, 44, 16, 39, 35,
       36, 46, 42, 13, 38, 43, 40, 47, 48])

In [None]:
df['gender'].unique()

array(['male', 'female'], dtype=object)

In [None]:
df['topic'].unique()

array(['Student', 'InvestmentBanking', 'indUnk', 'Non-Profit', 'Banking',
       'Education', 'Engineering', 'Science', 'Communications-Media',
       'BusinessServices', 'Sports-Recreation', 'Arts', 'Internet',
       'Museums-Libraries', 'Accounting', 'Technology', 'Law',
       'Consulting', 'Automotive', 'Religion', 'Fashion', 'Publishing',
       'Marketing', 'LawEnforcement-Security', 'HumanResources',
       'Telecommunications', 'Military', 'Government', 'Transportation',
       'Architecture', 'Advertising', 'Agriculture', 'Biotech',
       'RealEstate', 'Manufacturing', 'Construction', 'Chemicals',
       'Maritime', 'Tourism', 'Environment'], dtype=object)

In [None]:
df['sign'].unique()

array(['Leo', 'Aquarius', 'Aries', 'Capricorn', 'Gemini', 'Cancer',
       'Sagittarius', 'Scorpio', 'Libra', 'Virgo', 'Taurus', 'Pisces'],
      dtype=object)

In [None]:
len(df['text'].unique())

49249

In [None]:
df['text']

0                   Info has been found (+/- 100 pages,...
1                   These are the team members:   Drewe...
2                   In het kader van kernfusie op aarde...
3                         testing!!!  testing!!!          
4                     Thanks to Yahoo!'s Toolbar I can ...
                               ...                        
49995           Aug 7th Thur... Bought Her Mua Chee & S...
49996           Aug 6th Wed.. Her 1st Day @ Work Back @...
49997           Aug 4th Mon Zing's BD !! Went To Her Pl...
49998           Aug 3rd Sun.. Went To Her Place B4 Goin...
49999           Aug 1st Fri.. Met Her To Go Shoppin' @ ...
Name: text, Length: 50000, dtype: object

#### Data Cleansing

In [None]:
import re

In [None]:
df['text'] = df['text'].apply(lambda x : re.sub('[^A-Za-z]+',  ' ', x))

In [None]:
df['text'] = df['text'].apply(lambda x : x.strip())

In [None]:
df['text'] = df['text'].apply(lambda x : x.lower())

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
df['text'] = df['text'].apply(lambda x : ' '.join([word for word in x.split() if word not in sw]))

In [None]:
df

Unnamed: 0,gender,age,topic,sign,text
0,male,15,Student,Leo,info found pages mb pdf files wait untill team...
1,male,15,Student,Leo,team members drewes van der laag urllink mail ...
2,male,15,Student,Leo,het kader van kernfusie op aarde maak je eigen...
3,male,15,Student,Leo,testing testing
4,male,33,InvestmentBanking,Aquarius,thanks yahoo toolbar capture urls popups means...
...,...,...,...,...,...
49995,male,23,Advertising,Taurus,aug th thur bought mua chee vcds send home work
49996,male,23,Advertising,Taurus,aug th wed st day work back dw sent work sent ...
49997,male,23,Advertising,Taurus,aug th mon zing bd went place cooked dinner to...
49998,male,23,Advertising,Taurus,aug rd sun went place b goin get zing bd prese...


#### Merger

In [None]:
# %%time
# df['target'] = df[df.columns[:-1]].apply(lambda x : ','.join(x.astype(str)), axis = 1)

In [None]:
df['target'] = df.apply(lambda row: [row['gender'], str(row['age']), row['topic'], row['sign']], axis = 1)

In [None]:
# df.reset_index(drop = True, inplace = True)

In [None]:
df.head()

Unnamed: 0,gender,age,topic,sign,text,target
0,male,15,Student,Leo,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,male,15,Student,Leo,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,male,15,Student,Leo,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,male,15,Student,Leo,testing testing,"[male, 15, Student, Leo]"
4,male,33,InvestmentBanking,Aquarius,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [None]:
df_new = df[['text', 'target']]
df_new.head(2)

Unnamed: 0,text,target
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"


#### Train & Split

In [None]:
x = df_new['text']
y = df_new['target']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [None]:
X_train.shape, X_test.shape

((35000,), (15000,))

### Vectorisation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vc = CountVectorizer(ngram_range = (1,2), min_df = 1, stop_words = 'english')

In [None]:
X_train_vc = vc.fit_transform(X_train)
X_test_vc = vc.transform(X_test)

In [None]:
vc.get_feature_names()[:10]

['aa',
 'aa aa',
 'aa anger',
 'aa batteries',
 'aa class',
 'aa compared',
 'aa damn',
 'aa ended',
 'aa eriol',
 'aa forms']

### Classifier

In [None]:
target_count = dict()

for target in df['target'].values:
  for label in target:
    if label in target_count:
      target_count[label] += 1
    else:
      target_count[label] = 1

In [None]:
target_count

{'13': 745,
 '14': 2043,
 '15': 3508,
 '16': 4156,
 '17': 6859,
 '23': 5518,
 '24': 5746,
 '25': 2837,
 '26': 2869,
 '27': 4094,
 '33': 1654,
 '34': 1886,
 '35': 3365,
 '36': 1985,
 '37': 310,
 '38': 196,
 '39': 412,
 '40': 192,
 '41': 394,
 '42': 96,
 '43': 150,
 '44': 38,
 '45': 93,
 '46': 330,
 '47': 206,
 '48': 318,
 'Accounting': 364,
 'Advertising': 273,
 'Agriculture': 78,
 'Aquarius': 4784,
 'Architecture': 70,
 'Aries': 7795,
 'Arts': 1817,
 'Automotive': 116,
 'Banking': 283,
 'Biotech': 101,
 'BusinessServices': 416,
 'Cancer': 4589,
 'Capricorn': 3819,
 'Chemicals': 75,
 'Communications-Media': 1603,
 'Construction': 28,
 'Consulting': 243,
 'Education': 2646,
 'Engineering': 1402,
 'Environment': 6,
 'Fashion': 1805,
 'Gemini': 2558,
 'Government': 599,
 'HumanResources': 79,
 'Internet': 1420,
 'InvestmentBanking': 85,
 'Law': 308,
 'LawEnforcement-Security': 125,
 'Leo': 3904,
 'Libra': 4378,
 'Manufacturing': 441,
 'Maritime': 54,
 'Marketing': 414,
 'Military': 194,
 '

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
mlb = MultiLabelBinarizer(classes = sorted(target_count.keys()))

In [None]:
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
clf = OneVsRestClassifier(clf)

In [None]:
%%time
clf.fit(X_train_vc, y_train)

CPU times: user 2h 25min 16s, sys: 1h 14min 55s, total: 3h 40min 12s
Wall time: 2h 2min 35s


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [None]:
predicted_label = clf.predict(X_test_vc)
predicted_score = clf.decision_function(X_test_vc)

In [None]:
predicted_label.shape, predicted_score.shape

((15000, 80), (15000, 80))

In [None]:
prediction_inverse = mlb.inverse_transform(predicted_label)
y_test_inv = mlb.inverse_transform(y_test)

### Classification Report

In [None]:
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score

In [None]:
accuracy_score(y_test, predicted_label)

0.14193333333333333

In [None]:
f1_score(y_test, predicted_label, average = 'weighted')

0.4685028168556825

In [None]:
average_precision_score(y_test, predicted_label, average = 'weighted')

0.37982961022987755

In [None]:
recall_score(y_test, predicted_label, average = 'weighted')

0.38715

We can see that the model has performed poorly. The accuracy score is not sufficient because while we clipped the dataset, we've lost more than half of the information and it might contain some unbalanced data. Therefore, for the improvement, we need to add more or work on the full details to the dataset, but it's possible when we have high computing power.

### True vs Predicted

In [None]:
for i in range(5):
  print("title: {}\ntrue labels: {}\npredicted labels: {}\n".format(X_test_vc[i], 
                                                                      '.'.join(y_test_inv[i]), 
                                                                      '.'.join(prediction_inverse[i])))

title:   (0, 12807)	1
  (0, 38097)	1
  (0, 108696)	1
  (0, 174124)	1
  (0, 175683)	1
  (0, 417699)	1
  (0, 421529)	1
  (0, 496809)	1
  (0, 496841)	1
  (0, 562154)	1
  (0, 562397)	1
  (0, 609147)	2
  (0, 610020)	1
  (0, 620567)	1
  (0, 621242)	1
  (0, 625407)	1
  (0, 625509)	1
  (0, 645459)	1
  (0, 646170)	1
  (0, 698081)	1
  (0, 706908)	1
  (0, 707160)	1
  (0, 776225)	1
  (0, 776798)	1
  (0, 811259)	1
  :	:
  (0, 1652639)	1
  (0, 1652789)	1
  (0, 1653197)	1
  (0, 1653208)	1
  (0, 1653233)	1
  (0, 1821285)	1
  (0, 1823730)	1
  (0, 1832274)	1
  (0, 1832292)	1
  (0, 1854975)	1
  (0, 1855948)	1
  (0, 1862544)	1
  (0, 1903037)	1
  (0, 1903627)	1
  (0, 1974704)	1
  (0, 1974875)	1
  (0, 1984633)	1
  (0, 1985888)	1
  (0, 1990065)	2
  (0, 1991989)	1
  (0, 1992996)	1
  (0, 2033475)	1
  (0, 2033567)	1
  (0, 2056068)	1
  (0, 2056432)	1
true labels: 37.Education.Gemini.female
predicted labels: male

title:   (0, 64752)	1
  (0, 64758)	1
  (0, 64780)	1
  (0, 81906)	1
  (0, 125595)	1
  (0, 125791)	1
 

Some of them have predicted the wrong because of the low performance of the model.

### Conclusion

First, we imported the data, then take a sample to avoid crashing. Then performed the pre-processing, cleansing and transformed it. After that, we split the data into train and test, plus applied the vectorisations. And then tune it to make it a classifier. However, the dataset hasn't performed well as we expected. As we've discussed before, we need to have a high computational power to overcome this issue.