# This script transforms the raw data (numerical, text) into numerical feature representations. 

# In [9]:
# general packages
import pandas as pd
import numpy as np
import pickle
from urllib.request import urlopen
import re
import sys
import copy
import pyarrow.parquet as pq
import pyarrow as pa
import time
import os

# pipeline packages
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import umap.umap_ as umap
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PolynomialFeatures


# INFO:tensorflow:Enabling eager execution
# INFO:tensorflow:Enabling v2 tensorshape
# INFO:tensorflow:Enabling resource variables
# INFO:tensorflow:Enabling tensor equality
# INFO:tensorflow:Enabling control flow v2


# Build Pipelines

# In [12]:
def build_num_transformer(data, feat, log):
    """
    Build the ColumnTransformer entry for one numerical feature.

    Inputs:
    data (pandas df): master dataframe
    feat (string): single target feature name
    log (boolean): if True, log transform the feature before scaling

    Outputs:
    to_transform (list): one-element list holding a transformer of the form
    ('transformer_name', Pipeline, ['feature_name']); the transformer is named
    'log' when the log step is included and 'none' otherwise
    """
    # Offset added before taking the log: |column minimum| + 0.001, so the
    # shifted minimum is never below 0.001 and the log stays finite.
    offset = abs(data[feat].min()) + 0.001

    # Scaling is always applied; the shifted log is optionally placed before it.
    steps = [('scale', StandardScaler())]
    label = 'none'
    if log:
        steps.insert(0, ('log', FunctionTransformer(lambda x: np.log(x + offset))))
        label = 'log'

    return [(label, Pipeline(steps), [feat])]

# In [1]:
def build_text_transformer(feat, my_components, my_min_df):
    """
    Build the ColumnTransformer entry for one text feature
    (hashtags, domains, followees, mentions, bio, follower_bios, followee_bios, text).

    The pipeline is TF-IDF (unigrams, max_df=1.0) followed by TruncatedSVD
    (random_state=17). Only the requested feature's pipeline is constructed.

    Inputs:
    feat (string): single target feature name; must be one of the names above
    my_components (integer): best number of components for TruncatedSVD from tuning
    my_min_df (integer): best min_df for TF-IDF from tuning

    Outputs:
    to_transform (list): one-element list holding a transformer of the form
    ('feature_name', Pipeline, 'feature_name'). The column selector is the bare
    string (not a list) so that ColumnTransformer hands the vectorizer a 1-D
    column of documents.

    Raises:
    KeyError: if feat is not a supported text feature
    """
    # The per-feature pipelines differ only in the TF-IDF `binary` flag:
    # 'followees' uses binary term occurrence, all others use term counts.
    binary_by_feature = {
        'hashtags': False,
        'domains': False,
        'followees': True,
        'mentions': False,
        'bio': False,
        'follower_bios': False,
        'followee_bios': False,
        'text': False,
    }
    use_binary = binary_by_feature[feat]

    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 1), max_df=1.0, min_df=my_min_df,
                                  use_idf=True, binary=use_binary)),
        ('svd', TruncatedSVD(n_components=my_components, random_state=17)),
    ])

    return [(feat, pipe, feat)]

# Create Dataframe

# In [14]:
def get_features(data):
    """
    List the dataframe's column names, leaving out the two non-feature columns.

    Inputs:
    data (pandas df): master dataframe; must contain 'screen_name' and 'age'

    Outputs:
    feats (list): column names of data, in order, without 'screen_name' and 'age'
    """
    feats = [*data.columns]
    # list.remove raises ValueError if either column is missing from the dataframe.
    for excluded in ('screen_name', 'age'):
        feats.remove(excluded)
    return feats

# In [5]:
def _fit_text_feature(f_name, n_components, min_df, data_in):
    """Fit the TF-IDF + SVD pipeline for one text feature and return the transformed array."""
    trans = build_text_transformer(f_name, n_components, min_df)
    preprocessor = ColumnTransformer(trans, remainder='passthrough')
    return preprocessor.fit_transform(pd.DataFrame(data_in))


def _transform_text_feature(f_name, data_in, best_components, best_mindf, data_type, k_dict, l_dict):
    """Choose the min_df / n_components candidates for one text feature; return (output, k, l)."""
    mindf_options = best_mindf[f_name]
    component_options = best_components[f_name]

    if data_type != 'test':
        # Reuse the indices recorded by an earlier 'test' run. The fit is attempted
        # once per step: retrying the same fit cannot change the outcome, so a
        # failure is raised instead of looping forever.
        k = k_dict[f_name]
        l_idx = l_dict[f_name]
        output = _fit_text_feature(f_name, component_options[0], mindf_options[k], data_in)
        if output.shape[1] != component_options[l_idx] + 1:
            output = _fit_text_feature(f_name, component_options[l_idx], mindf_options[k], data_in)
            if output.shape[1] != component_options[l_idx] + 1:
                raise ValueError(
                    "feature {!r}: n_components={} did not produce the requested number of columns".format(
                        f_name, component_options[l_idx]))
        return output, k, l_idx

    # 'test' mode: search for the first min_df candidate (starting at index 1)
    # that can be fit with the first n_components candidate.
    output, k, last_error = None, 0, None
    for candidate in range(1, len(mindf_options)):
        try:
            output = _fit_text_feature(f_name, component_options[0], mindf_options[candidate], data_in)
        except Exception as err:
            last_error = err
        else:
            k = candidate
            break
    if output is None:
        raise ValueError(
            "feature {!r}: no min_df candidate could be fit".format(f_name)) from last_error

    # Then advance through the n_components candidates until the output width
    # equals the requested components plus the passed-through screen_name column.
    l_idx = 0
    while output.shape[1] != component_options[l_idx] + 1:
        l_idx += 1
        if l_idx >= len(component_options):
            raise IndexError(
                "feature {!r}: no n_components candidate produced the requested number of columns".format(f_name))
        try:
            output = _fit_text_feature(f_name, component_options[l_idx], mindf_options[k], data_in)
        except Exception:
            # Best effort, as before: keep the previous output; the loop condition
            # then checks it against this candidate's width.
            pass
    return output, k, l_idx


def create_dataframe(data, target, best_components, best_mindf, results_path, data_type, k_dict=None, l_dict=None):
    """
    This function creates the transformed master dataframe using the pipeline builders.

    Every non-CRT feature is transformed on the rows where it is not null:
    numerical columns are (optionally log-transformed and) scaled, all other
    columns go through TF-IDF + TruncatedSVD. The CRT columns are merged back in
    at the end, rounded to 4 decimals, and the result is written to
    <results_path>/data_transformed_<data_type>.csv.

    Inputs:
    data (pandas df): master dataframe (must contain 'screen_name' and 'age')
    target (string): CRT target feature (numeric, conceptual, or both); currently unused
    best_components (dictionary of lists): candidate n_components for TruncatedSVD per text feature
    best_mindf (dictionary of lists): candidate min_df for TF-IDF per text feature
    results_path (string): path to the folder in which to store the output df
    data_type (string): 'test' searches for workable candidate indices and records them;
        any other value reuses the indices in k_dict / l_dict. Rows whose feature is
        null are re-added as NaN for 'full' / 'dropped' and as the column mean for
        'imputed' (case-insensitive).
    k_dict (dictionary): per-feature index into best_mindf; required unless data_type is 'test'
    l_dict (dictionary): per-feature index into best_components; required unless data_type is 'test'

    Outputs:
    df_i (pandas df): transformed master dataframe
    k_dict (dictionary): min_df index used for each text feature
    l_dict (dictionary): n_components index used for each text feature

    Raises:
    KeyError: if data_type is not 'test' and a text feature is missing from k_dict / l_dict
    ValueError: if no usable min_df / n_components candidate exists for a text feature
    IndexError: if 'test' mode runs out of n_components candidates
    """
    # Fresh dicts per call; a `{}` default would be shared between calls.
    k_dict = {} if k_dict is None else k_dict
    l_dict = {} if l_dict is None else l_dict
    fill_mode = data_type.lower()

    df_i = pd.DataFrame(data['screen_name'])

    for f_name in get_features(data):

        # CRT targets are not transformed; they are merged back in below.
        if f_name.startswith('CRT'):
            continue

        # transform data that is not null
        data_select = data[['screen_name', f_name]]
        has_value = data_select[f_name].notnull()
        data_in = data_select[has_value]
        data_nan = data_select[~has_value]

        if data[f_name].dtype in [float, int, np.float64, np.int64]:
            # log-transform columns with medium or high skew
            skew = data[f_name].skew(axis=0, skipna=True)
            log = bool(skew < -0.5 or skew > 0.5)
            trans = build_num_transformer(data, f_name, log)
            preprocessor = ColumnTransformer(trans, remainder='passthrough')
            output = preprocessor.fit_transform(pd.DataFrame(data_in))
            cols = [f_name]
        else:
            output, k, l_idx = _transform_text_feature(
                f_name, data_in, best_components, best_mindf, data_type, k_dict, l_dict)
            k_dict[f_name] = k
            l_dict[f_name] = l_idx
            # one column per SVD component; the last output column is screen_name
            cols = [f_name + "_" + str(i) for i in range(output.shape[1] - 1)]
        cols.append('screen_name')

        df_temp = pd.DataFrame(output, columns=cols).set_index('screen_name')

        # re-add the users whose feature was null
        for user in data_nan.screen_name:
            if fill_mode in ['full', 'dropped']:
                df_temp.loc[user] = np.nan
            elif fill_mode == 'imputed':
                df_temp.loc[user] = df_temp.mean()

        df_i = pd.merge(left=df_i, right=df_temp.reset_index(), on='screen_name', how='right')

    # add CRT score back in
    target_cols = [i for i in list(data.columns) if i.startswith('CRT')]
    target_cols.append('screen_name')

    df_i = pd.merge(left=df_i, right=data[target_cols], on='screen_name', how='right')

    # round CRT score for bucket purposes
    for i in df_i.columns:
        if i.startswith('CRT'):
            df_i[i] = df_i[i].apply(lambda x: round(x, 4))

    if not os.path.exists(results_path):
        os.makedirs(results_path)

    # os.path.join places the file inside results_path with or without a trailing separator.
    df_i.to_csv(os.path.join(results_path, "{}.csv".format('data_transformed_' + data_type)))

    return df_i, k_dict, l_dict

# In [4]:
def merge_dataframes(old_data_link, new_data_transformed, results_folder, data_type):
    """
    This function merges the old and new data into one master dataframe and
    writes it to <results_folder>/master_data_<data_type>.csv.

    Inputs:
    old_data_link (string or None): path to the old data CSV (its first column is
        read as the index); None means there is no old data
    new_data_transformed (pandas df): dataframe containing all new users
    results_folder (string): path to the output folder; created if it does not exist
    data_type (string): suffix used in the output file name

    Outputs:
    master_df (pandas df): combined df. When old data is given, only the columns
        present in both frames are kept and the index is renumbered from 0.
    """
    if old_data_link is None:
        master_df = new_data_transformed
    else:
        old_data_transformed = pd.read_csv(old_data_link, index_col=0)
        master_df = pd.concat([old_data_transformed, new_data_transformed], join="inner", ignore_index=True)

    # An empty folder name means the current directory; nothing to create then.
    if results_folder:
        os.makedirs(results_folder, exist_ok=True)

    # os.path.join places the file inside results_folder with or without a trailing separator.
    master_df.to_csv(os.path.join(results_folder, "{}.csv".format('master_data_' + data_type)))

    return master_df