In [1]:
# Operating system
import os
# Regular expression
import re
# Dataframe
import pandas as pd
# Natural language
import nltk
from nltk.corpus import stopwords
# Custom transformers
from sklearn.base import BaseEstimator, TransformerMixin
# Cross-validation
from sklearn.model_selection import train_test_split, GridSearchCV
# Preprocessing
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
# Pipeline
from sklearn.pipeline import Pipeline
# Feature selection
from sklearn.feature_extraction.text import TfidfVectorizer
# Multilabel
from sklearn.multiclass import OneVsRestClassifier
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
# Metrics
from sklearn.metrics import (make_scorer, average_precision_score, f1_score,
                             accuracy_score, recall_score)

In [2]:
# Install nltk data
nltk.download()
# Punkt tokenizer models
nltk.download('punkt')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jacek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Seed
seed = 42

# Import Data

In [4]:
# Getting the Data
path = os.path.join('./data', 'blogtext.csv.zip')
df = pd.read_csv(path, compression='zip')
# Print No of rows & columns
print(f'No of rows: {df.shape[0]}\nNo of columns: {df.shape[1]}')
# Show first 5 rows
df.head()

No of rows: 681284
No of columns: 7


Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


## Get General Information about Data

In [5]:
# Check data
df.info()
# Check the missing data
print(f'Missing cells: {df.isnull().sum().sum()}')
# No missing data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB
Missing cells: 0


In [6]:
# Let's describe & inspect dataset
df.describe() 

Unnamed: 0,id,age
count,681284.0,681284.0
mean,2397802.0,23.932326
std,1247723.0,7.786009
min,5114.0,13.0
25%,1239610.0,17.0
50%,2607577.0,24.0
75%,3525660.0,26.0
max,4337650.0,48.0


In [7]:
# Let`s inspect the columns
df.columns 

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

## Remove unnecessary features

In [8]:
# Remove unnecessary features
df.drop(['id','date'], axis=1, inplace=True)
# Show first 5 rows
df.head()

Unnamed: 0,gender,age,topic,sign,text
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,..."
1,male,15,Student,Leo,These are the team members: Drewe...
2,male,15,Student,Leo,In het kader van kernfusie op aarde...
3,male,15,Student,Leo,testing!!! testing!!!
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...


## Subset Data to Speed-up Training

In [9]:
# Subset data
df = df.iloc[:100,:]

# Data Preprocessing

## Alphabet Chars

In [10]:
class RemoveNonalpha(BaseEstimator, TransformerMixin):
    """Remove all non-alphabet characters from text
    Attributes:
        X (pd.Series): Column with text
    Methods:
        fit(X): Pass
        transform (X): Remove all non-alphabet chars from X
    """
    def fit(self, X: pd.Series, y=None):
        return self
    def transform(self, X: pd.Series):
        pattern = '[^a-z]+'
        # Remove all non-alphabet chars from string
        X = X.apply(lambda x: re.sub(pattern, ' ', x.lower().strip()))
        return X

## Stopwords

In [11]:
class StopWords(BaseEstimator, TransformerMixin):
    """Remove all english stop words from text
    Attributes:
        X (pd.Series): Column with text
    Methods:
        fit(X): Pass
        transform (X): Remove stop words from X
    """
    def fit(self, X: pd.Series, y=None):
        return self
    def transform(self, X: pd.Series):
        # Remove all stop words, punctuation
        stop_words = set(stopwords.words('english'))
        X = X.apply(lambda x: ' '.join(
            [words for words in x.split() if words not in stop_words]
            ))
        return X

## Lemmatization/Stemming

In [12]:
class Stemming(BaseEstimator, TransformerMixin):
    """Reduce inflection in words to their root forms
    Attributes:
        X (pd.Series): Column with text
    Methods:
        fit(X): Pass
        transform (X): Convert all words from X to the root form 
    """
    def fit(self, X: pd.Series, y=None):
        return self
    def transform(self, X: pd.Series):
        # Lemmatization/Stemming
        porter_stemmer = nltk.PorterStemmer()
        X = X.apply(lambda x: ' '.join(
            [porter_stemmer.stem(words) for words in x.split()]
            ))
        return X

## Prepare X & Y

In [13]:
# Prepare X (text) & y (labels) data
X = df['text']
y = df.apply(lambda x: [x['gender'], str(x['age']), x['sign']], axis=1)

## Binarize Multilabels

In [14]:
# Prepare/binarize multilabel y data
binarizer = MultiLabelBinarizer()
y = binarizer.fit_transform(y)

# Print multilabel classes
print(binarizer.classes_)

['14' '15' '17' '25' '33' 'Aquarius' 'Aries' 'Capricorn' 'Gemini' 'Leo'
 'female' 'male']


## Train & Test Split

In [15]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(
                                        X, y, 
                                        test_size=.2, 
                                        random_state=seed, 
                                    )
print(f'Train: {len(X_train)}, Test: {len(X_test)}')

Train: 80, Test: 20


# Classification Model

## Scores

In [16]:
# Create scoring for models
# Accuracy
accuracy_sc = make_scorer(accuracy_score, greater_is_better=True)
# F1
f1_sc = make_scorer(f1_score, average='weighted', greater_is_better=True)

# All scores together
scoring={
    "acc": accuracy_sc,
    "f1": f1_sc
}

## Pipeline

In [17]:
# Create pipeline
pipeline = Pipeline([
    ('remove', RemoveNonalpha()),
    ('stopwords', StopWords()),
    ('stem', Stemming()),
    ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
    ('scaler', 'passthrough'),
    ('decomposer', 'passthrough'),
    ('classifier', 'passthrough')
])

## Model Hyperparameters

In [18]:
# Grid parameters
param_grid = {
    'scaler': ['passthrough', StandardScaler()
    ],
    'decomposer': ['passthrough', TruncatedSVD()
    ],
    'classifier': [OneVsRestClassifier(LogisticRegression()), 
                 OneVsRestClassifier(RandomForestClassifier()), 
                 OneVsRestClassifier(SVC()),
                 OneVsRestClassifier(xgb.XGBClassifier())
                ]
}

## Grid Search CV

In [19]:
# GridSearchCV
search = GridSearchCV(
            estimator=pipeline, 
            param_grid=param_grid, 
            scoring=scoring,
            refit="f1",
            n_jobs=1,
            cv=5)

## Fit Model

In [20]:
# Fit model
model_fit = search.fit(X_train, y_train)

## Best Model

In [21]:
# Print best parameters
print(f'Best model parameters:\n{model_fit.best_params_}')

Best model parameters:
{'classifier': OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=None, reg_alpha=None,


## Predict Labels

In [22]:
# Pedict labels
y_pred = model_fit.predict(X_test)
y_pred

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [23]:
# Invert binarized labels to the readable form
binarizer.inverse_transform(y_pred)[:5]

[('female',),
 ('33', 'Aquarius', 'male'),
 ('33', 'Aquarius', 'male'),
 ('33', 'Aquarius', 'male'),
 ('33', 'Aquarius', 'male')]

## Evaluate Model

In [25]:
# Print scores
print('Accuracy score: ', accuracy_score(y_test, y_pred))
print('F1 score: ', f1_score(y_test, y_pred, average='micro'))
print('Average precision score: ', average_precision_score(y_test, y_pred, average='micro'))
print('Average recall score: ', recall_score(y_test, y_pred, average='micro'))

Accuracy score:  0.7
F1 score:  0.8545454545454546
Average precision score:  0.7905
Average recall score:  0.7833333333333333
