In [1]:
# to input data and graphs
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import seaborn as sns
# preprocessing
import random
import pickle
import re
from nltk.corpus import stopwords
# import nltk
# nltk.download('stopwords')
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# train tools
from sklearn.ensemble import VotingRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from catboost import CatBoostRegressor
# another settings
import warnings
warnings.filterwarnings('ignore')

2022-11-05 19:06:58.178291: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. Import Data

In [2]:
data = pd.read_csv('data/sales.csv', index_col=False)
data.head()

Unnamed: 0,NAME,AGE,MORNING,DAY,EVENING,NIGHT,FIRST TWEET,SECOND TWEET,THIRD TWEET,FOURTH TWEET,PURCHASE AMOUNT
0,Жданова Мария Ивановна,22,1,1,1,1,Q: Вот я Вася как я могла не правильно написа...,"Еду домой в автобусе =""( мочевой пузырь ща лоп...",@DmitryMalikov Радость нежность и тоска чувств...,RT @RaccoonMr: МЕНЬШЕ ЧЕМ ЧЕРЕЗ 2 НЕДЕЛИ КАНИК...,17209
1,Никитин Георгий Родионович,24,1,1,0,0,Будем надеяться на хорошую погоду на выходных ...,869844/1 Я ее просто распихать по шкафам не ус...,Распродажа сумок для ноутбуков ))) Все по 1000...,@galyonkin КАК?! Я просто хочу знать учитывая ...,73248
2,Исаева Анна Георгиевна,24,1,1,1,1,"это 5 хоть и баян) ""@root_sashok: :D http://t....",Новогодняя аудио подборочка) Забираем на стену...,@irina33371 ахахахх я заметила: отмычки перчат...,"Что делать в случае ""никто-тебя-не-любит""? ДЕЛ...",24312
3,Сидоров Олег Алексеевич,21,1,1,0,0,@_StalArt_ очень верное и мудрое решениетак ск...,@DyagilevaNastya @3DG_rock7 любовный треугольн...,@controlflow Когда упускаю-то? в #1? Это ты пр...,RT @4ayanmusiq: В декабре я снова посещу Казан...,71999
4,Николаева Юлия Матвеевна,20,0,0,1,0,RT @ksana_semenova: эти чихающие детки в школе...,#ХочуПровестиНовогоднююНочьВместеС @coldiumtea...,Прогуливаю английский чтобы пойти на факультат...,@pani_walewska @rubis32 @maxbryansk ждете сигн...,35056


## 2. Data Preprocessing

In [3]:
X = shuffle(data)
y = X.pop('PURCHASE AMOUNT')

## 3. Build Models

In [4]:
MAX_SEQUENCE_LENGTH = 140
MAX_SEQUENCE_LENGTH_NAMES = 36

vocab_size = 152836
vocab_size_names = 5793

def model_name():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size_names, 16, input_length=MAX_SEQUENCE_LENGTH_NAMES))
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.L2(0.005)))
    model.add(tf.keras.layers.Dense(32, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))
    return model

def model_tweet():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size, 16, input_length=MAX_SEQUENCE_LENGTH))
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.L2(0.005)))
    model.add(tf.keras.layers.Dropout(0.4))
    model.add(tf.keras.layers.Dense(32, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(8, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))
    return model

stopwords_rus = set(stopwords.words("russian"))

def clean_text(text):
    text = text.lower() # convert to lowercase
    text = re.sub("[^а-я]", " ", text)
    words = [word for word in text.split() if word not in stopwords_rus]
    text = " ".join(words)
    return text

# loading
with open('pretrained_models/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
with open('pretrained_models/tokenizer_names.pickle', 'rb') as handle:
    tokenizer_names = pickle.load(handle)

## 4. Custom Regressor

In [5]:
# SEED for reproducible result
from tqdm import tqdm
SEED = 101
mod_names_path = "pretrained_models/training_names/cp.ckpt"
mod_tweet_path = "pretrained_models/training_tweets/cp.ckpt"

class CustomRegressor():
    
    def __init__(self, n_estimators=None, min_weight_fraction_leaf=None, max_leaf_nodes=None):
        
        self.n_estimators = n_estimators
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_leaf_nodes = max_leaf_nodes
    
    def _clearText_(self, X, columns):
        
        for column in columns:
            X[column] = X[column].map(clean_text)
        return X
    
    def _predProf_(self, X):
        
        data = X[['MORNING', 'DAY', 'EVENING', 'NIGHT']]
        clf = load('pretrained_models/prof_clf.joblib') 
        predicted_profs = clf.predict(data)
        X['PROFESSION'] = predicted_profs
        return X
    
    def _predCondition_(self, X):
        
        model = model_tweet()
        model.load_weights(mod_tweet_path)
        tweets = ['FIRST TWEET', 'SECOND TWEET', 'THIRD TWEET', 'FOURTH TWEET']
        condition = pd.DataFrame(0, index=np.arange(len(X)), columns={0: 'CONDITION'})
        for tweet in tweets:
            text_sequences = tokenizer.texts_to_sequences(X[tweet])
            tweet_texts = pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
            predicted_cond = np.rint(model.predict(tweet_texts)).astype('int32')
            condition += predicted_cond
        condition = condition.replace(0, 1)
        X['CONDITION'] = 0
        for i, num in enumerate(condition):
            X['CONDITION'].iloc[i] = condition.iloc[i]
        return X
    
    def _predGender_(self, X):
        
        data = X['NAME']
        model = model_name()
        model.load_weights(mod_names_path)
        text_sequences = tokenizer_names.texts_to_sequences(data)
        names = pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH_NAMES, padding='post')
        predicted_genders = np.rint(model.predict(names)).astype('int32')
        X['GENDER'] = predicted_genders
        return X
    
    def _prepare_(self, X):
        
        X1 = self._clearText_(X, ['FIRST TWEET', 'SECOND TWEET', 'THIRD TWEET', 'FOURTH TWEET', 'NAME'])
        X2 = self._predProf_(X1)
        X3 = self._predCondition_(X2)
        X4 = self._predGender_(X3)
        X5 = X4.drop(['NAME', 'MORNING', 'DAY', 'EVENING', 'NIGHT', 'FIRST TWEET', 'SECOND TWEET', 'THIRD TWEET', 'FOURTH TWEET'], axis=1)
        return X5
        
    def _estimator_(self, X, y):
        warnings.simplefilter(action='ignore', category=FutureWarning)
            
        GausProcess = make_pipeline(
            TransformedTargetRegressor(
                regressor = GaussianProcessRegressor(random_state=SEED),
                func=np.log1p,
                inverse_func=np.expm1)
        )
            
        CatBoost = make_pipeline(
            TransformedTargetRegressor(
                regressor = CatBoostRegressor(random_state=SEED, verbose=0),
                func=np.log1p,
                inverse_func=np.expm1)
        )
            
        RandForest = make_pipeline(
            TransformedTargetRegressor(
                regressor = RandomForestRegressor(max_leaf_nodes=60, max_depth=9, random_state=SEED),
                func=np.log1p,
                inverse_func=np.expm1)
        )
        
        estimators = [
            ('GaussProcess', GausProcess),
            ('CatBoost', CatBoost),
            ("RandForest", RandForest)
        ]
            
        model = VotingRegressor(estimators)
            
        model.fit(X, y)
        return model
    
    def fit(self, X, y):
        print("Fit stage...")
        X = self._prepare_(X)
        self.estimators_ = []
        estimator_ = self._estimator_(X, y)
        self.estimators_.append(estimator_)
    
    def predict(self, X):
        print("Prediction stage...")
        X = self._prepare_(X)
        y_pred = []
        for est in tqdm(self.estimators_):
            y_pred_ = est.predict(X)
            y_pred.append(y_pred_)
            
        return np.stack(y_pred, axis=1)

## 5. Train Model

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=228)

In [7]:
model = CustomRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

Fit stage...


2022-11-05 19:07:02.517174: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Prediction stage...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.36it/s]


## 6. Predict

In [8]:
predictions = model.predict(X_test)
predictions = predictions.astype('int')

Prediction stage...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.23it/s]


In [9]:
import sklearn.metrics as sm
from joblib import load
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, predictions), 2)) 
print("Mean squared error =", round(sm.mean_squared_error(y_test, predictions), 2)) 
print("Median absolute error =", round(sm.median_absolute_error(y_test, predictions), 2)) 
print("Explain variance score =", round(sm.explained_variance_score(y_test, predictions), 2)) 
print("R2 score =", round(sm.r2_score(y_test, predictions), 2))

Mean absolute error = 10029.05
Mean squared error = 185528691.13
Median absolute error = 7902.5
Explain variance score = 0.86
R2 score = 0.86
