In [1]:
#!/usr/bin/env python
# coding: utf-8

# Use the default experiment configuration unless a `config` name is already
# bound in this namespace; in that case the existing value is kept as-is.
# Alternative configurations:
#     'configs.config_WomenInEngineering_local'
#     'configs.config_Digits_local'
config = locals().get('config', 'configs.config_ILLAE_local')

In [2]:
### initialization ###

# Project modules are loaded by name through importlib and then reloaded, so
# re-running this cell re-executes each module instead of reusing the cached one.
import importlib
print('config file is set to {}'.format(config))

# `imp` is the project's `_imports` module; the rest of the notebook reaches
# shared names through it (imp.pd, imp.np, imp.pickle, imp.copy, imp.SVC,
# imp.Parallel, imp.delayed, imp.pprint).
imp=importlib.import_module('_imports')
importlib.reload(imp)

# `util` is the project's `_utils` helper module.
util=importlib.import_module('_utils')
importlib.reload(util)

# `c` is the configuration module selected by the dotted name in `config`.
c=importlib.import_module(config)
importlib.reload(c)

# Requires an IPython kernel: `get_ipython` is not defined in plain Python.
get_ipython().run_line_magic('matplotlib', 'inline')

# param_setup(sys.argv[1:], c)
# NOTE(review): receives this notebook's globals and the config module;
# presumably applies parameter overrides to `c` - confirm in `_utils`.
util.param_setup_ipython(globals(), c)

config file is set to configs.config_ILLAE_local


In [3]:
### reading experiment results from pickled files ###

# The experiment results were pickled in `c.index_step` chunk files numbered
# 1..c.index_step; load them in order and merge them into one flat list.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only load result files produced by a trusted run of this pipeline.
result_cvs = []
for i in range(1, c.index_step + 1):
    chunk_path = '{}tmp/{}_db_result_cvs_{}_from_{}.pickle'.format(c.directory['save'],
                                                                   util.experiment_name(c),
                                                                   i,
                                                                   c.index_step)
    # `with` closes the handle even when unpickling raises.
    with open(chunk_path, 'rb') as dbfile:
        result_cvs.extend(imp.pickle.load(dbfile))

imp.pprint('number of experiments = {}'.format(len(result_cvs)))

'number of experiments = 150'


In [4]:
### functions for metric evaluation ###

def metric_eval(pred_class, true_class, class_labels):
    """Compute overall accuracy and a per-class F1 score.

    Parameters
    ----------
    pred_class : sequence
        Predicted labels.
    true_class : sequence
        Ground-truth labels, paired positionally with ``pred_class``.
    class_labels : iterable
        Labels for which an F1 score is reported.

    Returns
    -------
    dict
        ``{'accuracy': <float>, 'F1_score': {label: <score>}}``. A label with
        no true positives gets an F1 of 0. With no predictions at all the
        accuracy is reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    pairs = list(zip(pred_class, true_class))

    n_predictions = len(pred_class)
    n_correct = sum(1 for pred, true in pairs if pred == true)
    # Guard the empty case: there is nothing to divide by.
    accuracy = n_correct / n_predictions if n_predictions else 0.0

    F1_score = dict()
    for label in class_labels:
        TP = 0
        FP = 0
        FN = 0
        for pred, true in pairs:
            if pred == true:
                if pred == label:
                    TP += 1
            elif pred == label:
                # predicted `label`, but the truth is something else
                FP += 1
            elif true == label:
                # truth is `label`, but something else was predicted
                FN += 1

        if TP == 0:
            # precision and recall are both 0 (or undefined): report 0
            F1_score[label] = 0
            continue

        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        # harmonic mean of precision and recall
        F1_score[label] = 2 / (1 / precision + 1 / recall)

    return dict(
        accuracy=accuracy,
        F1_score=F1_score
    )
    
def create_list_of_results(df_to_show):
    """Flatten a list of experiment results into one table.

    Each entry of ``df_to_show`` must provide ``entry['eval']['F1_score']``
    (a mapping label -> score), ``entry['eval']['accuracy']`` and
    ``entry['params']`` (a mapping of parameter name -> value).

    Returns a DataFrame with one row per entry: the per-label F1 columns,
    then an ``accuracy`` column, then the parameter columns. All column
    names are converted to strings.
    """
    f1_frame = imp.pd.DataFrame([entry['eval']['F1_score'] for entry in df_to_show])
    accuracy_frame = imp.pd.DataFrame(
        {'accuracy': [entry['eval']['accuracy'] for entry in df_to_show]}
    )
    params_frame = imp.pd.DataFrame([entry['params'] for entry in df_to_show])

    # Side-by-side concatenation; rows line up on the default positional index.
    combined = imp.pd.concat([f1_frame, accuracy_frame, params_frame], axis=1)
    combined.columns = list(map(str, combined.columns))
    return combined

In [5]:
### calculating evaluation metrics for each experiment ###

dfs=dict()
dfs['labels']=imp.pd.read_csv('{}data/{}_2_data_labels.csv'.format(c.directory['save'], c.project_name))
# class_labels = [str(label) for label in set(dfs['labels']['label'])]
class_labels = set(dfs['labels']['label'])

# Sort on every experiment parameter except the fold id ('cv'), so that the
# `c.inner` folds of one parameter combination end up next to each other.
keys = [key for key in result_cvs[0]['experiment'] if key != 'cv']
result_cvs = sorted(result_cvs,
                    key=lambda run: [run['experiment'][col] for col in keys])

# The grouping below consumes the sorted runs in blocks of `c.inner`; fail
# early with a clear message instead of an IndexError part-way through.
if c.inner <= 0 or len(result_cvs) % c.inner != 0:
    raise ValueError('{} cv results cannot be split into groups of {} folds'.format(len(result_cvs),
                                                                                    c.inner))

experiment_results = []
for i in range(0, len(result_cvs), c.inner):
    folds = result_cvs[i:i + c.inner]

    # Pool the predictions of all folds before computing the metrics.
    pred_list = []
    true_list = []
    for fold in folds:
        pred_list.extend(fold['pred'])
        true_list.extend(fold['true'])

    # Copy the parameters without the fold id rather than popping 'cv' from
    # the stored run, so `result_cvs` is left untouched and this cell can be
    # re-run on the same data.
    params = {key: value for key, value in folds[-1]['experiment'].items() if key != 'cv'}
    experiment_results.append({
        'params': params,
        'eval': metric_eval(pred_list, true_list, class_labels)
    })

params_search_results = create_list_of_results(experiment_results)

params_search_results.to_csv('{}experiments/{}/2_parameters_cv_results.csv'.format(c.directory['save'],
                                                                                   util.experiment_name(c)),
                             index=False)
util.display_df(params_search_results.head(100))

Unnamed: 0,False,True,accuracy,coef0,cost,degree,full_text_LDA_num_topics,full_text_LDA_topic_cutoff_threshold,full_text_no_above,full_text_no_below,gamma,hashtags_community_feature_type,hashtags_community_graph_type,kernel,lda_hashtags_LDA_num_topics,lda_hashtags_LDA_topic_cutoff_threshold,lda_hashtags_no_above,lda_hashtags_no_below
0,0.638035,0.63821,0.638123,0.0,1,1.0,20,0,1,1,0.01,weighted,weighted,sigmoid,20,0,1,1
1,0.570739,0.568344,0.569545,0.0,1,1.0,20,0,1,1,1.0,weighted,weighted,sigmoid,20,0,1,1
2,0.668162,0.687111,0.677915,0.0,1,2.0,20,0,1,1,0.01,weighted,weighted,poly,20,0,1,1
3,0.613057,0.601081,0.60716,0.0,1,2.0,20,0,1,1,1.0,weighted,weighted,poly,20,0,1,1
4,0.676082,0.711558,0.694848,0.0,1,3.0,20,0,1,1,0.01,weighted,weighted,poly,20,0,1,1
5,0.650757,0.651854,0.651306,0.0,1,3.0,20,0,1,1,1.0,weighted,weighted,poly,20,0,1,1
6,0.624393,0.627649,0.626028,0.0,10,1.0,20,0,1,1,0.01,weighted,weighted,sigmoid,20,0,1,1
7,0.568845,0.566125,0.567489,0.0,10,1.0,20,0,1,1,1.0,weighted,weighted,sigmoid,20,0,1,1
8,0.646441,0.637433,0.641993,0.0,10,2.0,20,0,1,1,0.01,weighted,weighted,poly,20,0,1,1
9,0.618087,0.603206,0.610789,0.0,10,2.0,20,0,1,1,1.0,weighted,weighted,poly,20,0,1,1


In [6]:
### finding the optimal parameters and saving ###

# Row labels of the summary table: overall accuracy first, then one entry per
# class label (as a string, matching the column names of `params_search_results`).
index = ['accuracy']
index.extend(list(map(str, class_labels)))

# For each metric keep the best-scoring row of the parameter search and tag it
# with the metric it was selected for.
max_results = list()
for metric_name in index:
    max_result = params_search_results.sort_values(metric_name, ascending=False).iloc[0]
    # pd.concat replaces Series.append, which was removed in pandas 2.0;
    # it returns a new Series, so the search table is not modified.
    max_result = imp.pd.concat([max_result, imp.pd.Series([metric_name], index=['metric'])])
    max_results.append(max_result)

max_results_df = imp.pd.DataFrame(max_results)
max_results_df.index = index

# Plain-dict version of every optimal row, with 'cv' set to 'all'.
max_results_list = []
for max_result in map(dict, max_results):
    max_result['cv'] = 'all'
    max_results_list.append(max_result)

max_results_df.to_csv('{}experiments/{}/3_optimal_parameters.csv'.format(c.directory['save'],
                                                                         util.experiment_name(c)),
                      index=True)
util.display_df(max_results_df)

Unnamed: 0,False,True,accuracy,coef0,cost,degree,full_text_LDA_num_topics,full_text_LDA_topic_cutoff_threshold,full_text_no_above,full_text_no_below,gamma,hashtags_community_feature_type,hashtags_community_graph_type,kernel,lda_hashtags_LDA_num_topics,lda_hashtags_LDA_topic_cutoff_threshold,lda_hashtags_no_above,lda_hashtags_no_below,metric
accuracy,0.710666,0.709263,0.709966,10.0,1,2.0,20,0,1,1,0.01,weighted,weighted,poly,20,0,1,1,accuracy
False,0.710666,0.709263,0.709966,10.0,1,2.0,20,0,1,1,0.01,weighted,weighted,poly,20,0,1,1,False
True,0.685251,0.712633,0.699565,1.0,10,1.0,20,0,1,1,1.0,weighted,weighted,linear,20,0,1,1,True


In [7]:
### reading class labels and concatenating features and original data ###

# Rebuild `dfs` from scratch: the label table and the static feature table.
dfs = {}
dfs['labels'] = imp.pd.read_csv(
    '{}data/{}_2_data_labels.csv'.format(c.directory['save'], c.project_name)
)
dfs['features'] = util.construct_static_features(c)
if not len(dfs['features']):
    # No static features came back: fall back to a frame holding only the
    # id column taken from the labels.
    dfs['features'] = imp.pd.DataFrame(dfs['labels'][c.id_name])

# Report the shape and show the first rows of both tables.
for frame_key, caption in (('labels', 'labels = {}'), ('features', 'featuers = {}')):
    imp.pprint(caption.format(dfs[frame_key].shape))
    util.display_df(dfs[frame_key].head())

'labels = (17354, 2)'


Unnamed: 0,tweet_id,label
0,637005633548632064,False
1,637007331574812673,True
2,637012950495031296,True
3,637012998473670656,True
4,637014738216443905,False


'featuers = (17354, 108)'


Unnamed: 0,tweet_id,friends_count,followers_count,favourites_count,statuses_count,profile_created_at_date,listed_count,verified,text,external_url,external_media_type,user_mentions,in_reply_to_user_id,hashtags_count,hashtags_sum,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,compare,interrog,number,quant,affect,posemo,negemo,anx,anger,sad,social,family,friend,female,male,cogproc,insight,cause,discrep,tentat,certain,differ,percept,see,hear,feel,bio,body,health,sexual,ingest,drives,affiliation,achieve,power,reward,risk,focuspast,focuspresent,focusfuture,relativ,motion,space,time,work,leisure,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,AllPunc,Period,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,637005633548632064,34,946,78,1127,1586,51,0,131,1.0,0.0,0.0,0.0,2,14.5,21,92.84,92.33,1.0,95.81,21.0,23.81,61.9,38.1,9.52,0.0,0.0,0.0,0.0,0.0,0.0,9.52,4.76,19.05,9.52,4.76,0.0,0.0,9.52,4.76,4.76,4.76,0.0,0.0,4.76,4.76,0.0,0.0,0.0,0.0,14.29,0.0,0.0,4.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.52,4.76,0.0,4.76,0.0,0.0,0.0,14.29,0.0,4.76,0.0,4.76,0.0,4.76,0.0,0.0,0.0,0.0,0.0,4.76,0.0,4.76,0.0,0.0,0.0,33.33,4.76,0.0,4.76,0.0,0.0,0.0,0.0,0.0,4.76,0.0,19.05
1,637007331574812673,196,253,117,672,3175,11,0,139,0.0,0.0,1.0,0.0,1,19.0,21,92.84,99.0,68.01,99.0,10.5,28.57,80.95,42.86,14.29,9.52,0.0,9.52,0.0,0.0,0.0,4.76,9.52,14.29,0.0,4.76,4.76,0.0,4.76,9.52,4.76,0.0,0.0,0.0,9.52,9.52,0.0,0.0,0.0,0.0,14.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.81,9.52,9.52,4.76,4.76,0.0,4.76,4.76,0.0,23.81,4.76,0.0,23.81,9.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.81,4.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.76,0.0,14.29
2,637012950495031296,1287,13,2,3,2316,2,0,43,0.0,1.0,0.0,0.0,1,19.0,6,92.84,50.0,1.0,25.77,6.0,33.33,16.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.67,0.0,16.67,0.0,0.0,0.0,66.67,16.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0
3,637012998473670656,464,494,2341,8345,2465,2,0,67,0.0,1.0,1.0,1.0,1,19.0,9,92.84,50.0,1.0,25.77,9.0,33.33,11.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.11,0.0,11.11,0.0,0.0,0.0,77.78,11.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.67
4,637014738216443905,416,206,16,85,2904,7,0,136,0.0,1.0,1.0,0.0,2,13.5,19,92.84,70.08,50.35,25.77,9.5,36.84,47.37,10.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,5.26,0.0,0.0,5.26,5.26,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,0.0,5.26,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.53,5.26,0.0,5.26,0.0,0.0,10.53,5.26,0.0,21.05,10.53,0.0,10.53,5.26,5.26,0.0,0.0,0.0,0.0,5.26,0.0,5.26,0.0,0.0,0.0,36.84,10.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.32


In [8]:
### reading cv ids for cv folds ###

# Ids read from cv_ids/outer_0.csv; the generic 'id' column is renamed to the
# project's id column so the table can be merged with features and labels.
outer_ids = imp.pd.read_csv(
    '{}{}'.format(c.directory['save'], 'cv_ids/outer_0.csv')
).rename(columns={'id': c.id_name})

# One id table per inner fold (their columns are not renamed here).
inner_ids = [
    imp.pd.read_csv('{}cv_ids/inner_0_{}.csv'.format(c.directory['save'], fold))
    for fold in range(c.inner)
]

imp.pprint('Number of cv = {}'.format(len(inner_ids)))

'Number of cv = 5'


In [9]:
### function for training and testing the optimal parameters ###

def optimal_model_training_testing(optimal_params, df_features, df_labels, df_inputs, dynamics):
    """Train an SVC with one parameter set on all inner-fold ids and test it on the outer ids.

    Besides its arguments, this function reads the notebook-level names
    ``inner_ids``, ``outer_ids``, ``class_labels``, ``c``, ``imp`` and ``util``.

    Parameters
    ----------
    optimal_params : dict
        Parameter set to evaluate. The classifier settings are read from the
        optional keys ``cost``, ``kernel``, ``gamma``, ``coef0`` and ``degree``;
        a copy extended with ``name``/``type`` is handed to every dynamic
        feature builder. The dict itself is not modified.
    df_features : DataFrame
        Static features; merged with each dynamic feature table on ``c.id_name``.
    df_labels : DataFrame
        Label table, forwarded to ``util.data_prepropressing``.
    df_inputs : dict
        Maps a dynamic feature name to its prepared input frame.
    dynamics : dict
        Maps a dynamic feature type to the module providing ``construct_features``.

    Returns
    -------
    dict
        ``result``: ``{'eval': <metric_eval output>, 'params': optimal_params}``;
        ``predictions``: the outer ids (as strings) next to ``pred`` and ``true`` columns.
    """
    # Stack the ids of all inner folds into one training table. A single
    # concat returns a new frame, so the shared `inner_ids` stay untouched.
    if inner_ids:
        training_ids = imp.pd.concat(inner_ids, ignore_index=True, axis=0)
    else:
        training_ids = imp.pd.DataFrame()
    training_ids = training_ids.rename(columns={'id': c.id_name})
    testing_ids = imp.copy.deepcopy(outer_ids)

    for feature in c.select_features['dynamic']:
        feature_type = c.features['dynamic'][feature]['type']
        dynamic = dynamics[feature_type]
        print(feature_type)
        df_input = df_inputs[feature]
        # how='right': keep exactly the rows of the id tables
        df_training = imp.pd.merge(df_input, training_ids, on=c.id_name, how='right')
        df_testing = imp.pd.merge(df_input, testing_ids, on=c.id_name, how='right')
        optimal_params_temp = imp.copy.deepcopy(optimal_params)
        optimal_params_temp['name'] = feature
        optimal_params_temp['type'] = feature_type

        df_dynamic = dynamic.construct_features(c, optimal_params_temp, feature,
                                                df_training, df_testing,
                                                training_ids, testing_ids)
#         df_dynamic=dynamic.retrieve_features(c, optimal_params_temp, feature)
#         util.display(df_dynamic.head())

        df_features = imp.pd.merge(df_features, df_dynamic, on=c.id_name, how='right')

    result_training = util.data_prepropressing(training_ids,
                                               df_features,
                                               df_labels,
                                               c
                                              )

    # The imputer and scaler returned for the training data are passed on for
    # the testing data.
    result_testing = util.data_prepropressing(testing_ids,
                                              df_features,
                                              df_labels,
                                              c,
                                              result_training['imputer'],
                                              result_training['scaler']
                                             )

    # Parameter sets built from a DataFrame row carry `degree` as a float
    # (e.g. 2.0). NOTE(review): `imp.SVC` looks like sklearn.svm.SVC, which
    # documents `degree` as an integer - pass integral floats as int.
    degree = optimal_params.get('degree', 1)
    if isinstance(degree, float) and degree.is_integer():
        degree = int(degree)

    classifier = imp.SVC(C=optimal_params.get('cost', 1),
                         kernel=optimal_params.get('kernel', 'linear'),
                         gamma=optimal_params.get('gamma', 1),
                         coef0=optimal_params.get('coef0', 1),
                         degree=degree,
                         max_iter=c.max_iter_optimization,
                         decision_function_shape='ovo',
                         cache_size=c.cache_size,
                         random_state=c.seed
                        )

    classifier.fit(result_training['features'], list(result_training['labels']['label']))
    array = imp.np.array(result_testing['features'])
    if len(result_testing['features']) == 1:
        # a single sample must still be handed over as a 2-D array
        array = array.reshape(1, -1)

    pred_list = list(classifier.predict(array))
    true_list = list(result_testing['labels']['label'])
    test_result = dict(
        eval=metric_eval(pred_list, true_list, class_labels),
        params=optimal_params
    )
    # Column-wise concat aligns on the positional index.
    # NOTE(review): assumes the preprocessed test rows keep the order and
    # length of `outer_ids` - confirm in `util.data_prepropressing`.
    test_predictions = imp.pd.concat([
        outer_ids.astype(str),
        imp.pd.DataFrame(pred_list, columns=['pred']),
        imp.pd.DataFrame(true_list, columns=['true'])],
        axis=1)

    return dict(result=test_result, predictions=test_predictions)

In [10]:
### loading dynamic feature modules ###

# `dynamics` maps a dynamic feature type to its builder module;
# `df_inputs` maps a dynamic feature name to its prepared input frame.
dynamics = {}
df_inputs = {}
for feature in c.select_features['dynamic']:
    dyn_type = c.features['dynamic'][feature]['type']

    # Import the module for this feature type and reload it so it is re-executed.
    dyn_module = importlib.import_module('dynamic_features.{}'.format(dyn_type))
    dynamics[dyn_type] = dyn_module
    importlib.reload(dyn_module)

    # Read the stored input for this feature, then let the module prepare it.
    raw_input = imp.pd.read_csv('{}data/features/{}_feature_{}({}).csv'.format(c.directory['save'],
                                                                               c.project_name,
                                                                               dyn_type,
                                                                               feature))
    df_inputs[feature] = dynamics[dyn_type].prep_input(raw_input, c)

parallel: 15 partitions with 15 cores for para_parsing_df
parallel: 15 partitions with 15 cores for para_parsing_df


In [11]:
### Debugging 1 - optimal params ###

# type(max_results[0])
# dict(max_results[0])
# max_result_test=max_results_list[0]
# optimal_params_temp=imp.copy.deepcopy(optimal_params)
# optimal_params_temp['name']=feature
# optimal_params_temp['type']=c.features['dynamic'][feature]['type']
# max_result_test

In [12]:
### Debugging 2 - running the for loop ###

# params=dict(
#     df_features=dfs['features'], 
#     df_labels=dfs['labels'], 
#     df_inputs=df_inputs,
#     dynamics=dynamics
# )

# optimal_model_training_testing(max_result_test, **params)

In [13]:
### running optimal models in parallel ###

# One task per optimal parameter set, run in worker processes.
run_parallel = imp.Parallel(n_jobs=c.num_cores,
                            prefer="processes",
                            verbose=3)
optimal_results = run_parallel(
    imp.delayed(optimal_model_training_testing)(
        max_result,
        dfs['features'],
        dfs['labels'],
        df_inputs,
        dynamics,
    )
    for max_result in max_results_list
)

[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done   3 out of   3 | elapsed:  2.7min finished


In [14]:
### saving the optimal models results ###

# Flatten the test results of the optimal models into one table.
optimal_results_df = create_list_of_results([entry['result'] for entry in optimal_results])

# Same row labels for both summary tables: 'accuracy' followed by every
# class label as a string.
index = ['accuracy'] + [str(label) for label in class_labels]
max_results_df.index = index
optimal_results_df.index = index

# The CSV is written without the row labels.
results_csv_path = '{}experiments/{}/4_optimal_models_results.csv'.format(c.directory['save'],
                                                                          util.experiment_name(c))
optimal_results_df.to_csv(results_csv_path, index=False)
util.display_df(optimal_results_df)

Unnamed: 0,False,True,accuracy,False.1,True.1,accuracy.1,coef0,cost,cv,degree,full_text_LDA_num_topics,full_text_LDA_topic_cutoff_threshold,full_text_no_above,full_text_no_below,gamma,hashtags_community_feature_type,hashtags_community_graph_type,kernel,lda_hashtags_LDA_num_topics,lda_hashtags_LDA_topic_cutoff_threshold,lda_hashtags_no_above,lda_hashtags_no_below,metric
accuracy,0.695988,0.695694,0.695841,0.710666,0.709263,0.709966,10.0,1,all,2.0,20,0,1,1,0.01,weighted,weighted,poly,20,0,1,1,accuracy
False,0.7,0.702312,0.701161,0.710666,0.709263,0.709966,10.0,1,all,2.0,20,0,1,1,0.01,weighted,weighted,poly,20,0,1,1,False
True,0.694387,0.734177,0.715667,0.685251,0.712633,0.699565,1.0,10,all,1.0,20,0,1,1,1.0,weighted,weighted,linear,20,0,1,1,True


In [15]:
### saving the results of optimal models predictions ###

# Create the output folder, then write one predictions file per optimal model,
# named after the metric that model was selected for.
util.mkdir('{}experiments/{}/true_pred_results/'.format(c.directory['save'],
                                                        util.experiment_name(c)))
for optimal_result in optimal_results:
    selected_metric = optimal_result['result']['params']['metric']
    predictions_path = '{}experiments/{}/true_pred_results/predictions_{}.csv'.format(c.directory['save'],
                                                                                      util.experiment_name(c),
                                                                                      selected_metric)
    optimal_result['predictions'].to_csv(predictions_path, index=False)

folder /Users/habibkarbasian/Documents/_Courses/_Datasets/_Retweetability/_ILLAE/experiments/ILLAE_final_svm_fffs_age_listed_verified_character_url_media_mentions_reply_hashtag_liwc_community(hashtags)_lda(lda_hashtags)_lda(full_text)/true_pred_results/ has been created.
