In [1]:
#Import Libraries
import pandas as pd
import load_data
import datetime
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [2]:
class Time: # change class name to Time
    
    """Time stamp for code execution"""
        
    def get_current_time(self):
        now = datetime.datetime.now()
        return(now)
    
    def get_start_time(self):
        # Start time
        start_time = self.get_current_time()
        return(start_time)
       
    def get_end_time(self):
        # End time
        end_time = self.get_current_time()
        elapsed_time = end_time - self.get_start_time()
        return(end_time, elapsed_time)
        
    def print_start(self):
        print('--------Start Script--------')
        print('--------Start Time: ' + self.get_start_time().strftime('%Y-%m-%d %H:%M:%S') + '-------\n')

    def print_end(self):
        print('Total ' + str(self.get_end_time()[1].seconds) + ' [sec]')
        print('-----End Time : ' + self.get_end_time()[0].strftime('%Y-%m-%d %H:%M:%S') + ' ---------')
        print('-----END SCRIPT------') 

In [3]:
def merge_df(left_df, right_df, merge_on, how, suffixes=(None,None)):
        df_merge = left_df.merge(right_df, on=[merge_on], how=how, suffixes=suffixes)
        print(f"Shape of dataframe: {df_merge.shape}")
        return df_merge

In [4]:
def main():
    
    time = Time()
    time.print_start()

    #Load Data
    mypath = "../data/"
    mydata = load_data.get_file_names(mypath)
    df = load_data.load_copy_data(mydata, mypath)
    #df.keys()

    #Check dimension of data
    print(f"We have {df['train_month_1'].shape[0]} rows and {df['train_month_1'].shape[1]} columns in our month 1 training dataset.")
    print(f"We have {df['train_month_2'].shape[0]} rows and {df['train_month_2'].shape[1]} columns in our month 2 training dataset.")
    print(f"We have {df['train_month_3_with_target'].shape[0]} rows and {df['train_month_3_with_target'].shape[1]} columns in month 3 training dataset.")

    #Look for duplicated rows
    print(f"We have {df['train_month_1'].duplicated().sum()} duplicated rows in our month 1 training dataset.")
    print(f"We have {df['train_month_2'].duplicated().sum()} duplicated rows in our month 2 training dataset.")
    print(f"We have {df['train_month_3_with_target'].duplicated().sum()} duplicated rows in our month 3 training dataset.")

    #Look at a few rows..
    #Merge data (should be 39+38+39 columns = 118?, rows = 63697) with client_id as key
    df_merged = merge_df(df['train_month_1'], df['train_month_2'], 'client_id', 'outer', ('_m1', '_m2'))
    df_merged = merge_df(df_merged, df['train_month_3_with_target'], 'client_id', 'outer')

    #Export merged data
    print(f'Export data to {mypath} as data_merged.csv')
    df_merged.to_csv(mypath + 'data_merged.csv', encoding='utf-8', index=False)

    time.print_end()

In [5]:
if __name__ == '__main__':
    main()

--------Start Script--------
--------Start Time: 2022-04-05 15:15:17-------

file name: train_month_1
file name: train_month_2
file name: test_month_1
file name: test_month_3
file name: test_month_2
file name: train_month_3_with_target
We have 63697 rows and 39 columns in our month 1 training dataset.
We have 63697 rows and 39 columns in our month 2 training dataset.
We have 63697 rows and 40 columns in month 3 training dataset.
We have 0 duplicated rows in our month 1 training dataset.
We have 0 duplicated rows in our month 2 training dataset.
We have 0 duplicated rows in our month 3 training dataset.
Shape of dataframe: (63697, 77)
Shape of dataframe: (63697, 116)
Export data to ../data/ as data_merged.csv
Total 86399 [sec]
-----End Time : 2022-04-05 15:15:20 ---------
-----END SCRIPT------
