In [28]:
import pandas as pd
import numpy as np
from pandas.core.interchange.from_dataframe import categorical_column_to_series

In [6]:
# Number of rows per chunk when CSV files are read piece by piece.
chunk_size = 100_000

In [5]:
# Analysis accumulators: a row counter starting at zero and two
# not-yet-populated per-column totals.
total_rows = 0
column_sums = column_squared_sums = None

In [6]:
# Step 3: Read the data in chunks
def read_csv_in_chunks(file_path, chunk_size):
    """Lazily yield successive DataFrames of up to ``chunk_size`` rows from a CSV.

    Parameters
    ----------
    file_path : str, path object or file-like
        CSV source; passed straight to ``pd.read_csv``.
    chunk_size : int
        Maximum number of rows in each yielded chunk.

    Yields
    ------
    pandas.DataFrame
        Consecutive pieces of the file, in file order.
    """
    # The chunked reader is a context manager: using it as one releases the
    # underlying file handle even when the caller stops iterating early.
    with pd.read_csv(file_path, chunksize=chunk_size) as reader:
        yield from reader

In [8]:
# Step 4: Sample 20% of the label rows (fixed seed, so the draw is repeatable)
labels = pd.read_csv('data/train_labels.csv')
sample_labels = labels.sample(random_state=42, frac=0.2)

In [10]:
# Stream the training data and, chunk by chunk, keep only the rows whose
# customer_ID appears in the sampled labels (inner join).
sampled_data = [
    chunk.merge(sample_labels, on='customer_ID', how='inner')
    for chunk in read_csv_in_chunks('data/train_data.csv', chunk_size)
]

# Stitch the per-chunk matches together into one frame
development_sample = pd.concat(sampled_data, ignore_index=True)

# Persist the development sample for the following steps
development_sample.to_csv('data/development_sample.csv', index=False)

In [8]:
# Load the saved development sample back as a list of chunks
data = list(read_csv_in_chunks('data/development_sample.csv', chunk_size))

    

In [13]:
# Join the loaded chunks into one frame with a fresh 0..n-1 index, then preview it
df = pd.concat(objs=data, ignore_index=True)
df.head(n=5)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-03-11,0.374606,0.033519,0.044293,1.008622,0.00147,0.459235,0.002339,0.006168,...,,,0.008263,0.006609,0.00737,,0.007171,0.00512,0.007513,0
1,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-04-22,0.414269,0.002516,0.059667,0.123964,0.004374,0.434148,0.001405,0.05213,...,,,0.001986,0.00405,0.000796,,0.001802,0.002364,0.003987,0
2,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-05-12,0.41331,0.003285,0.053418,0.304955,0.002316,0.415906,0.009388,0.04878,...,,,0.009515,0.008757,0.009219,,0.003134,0.001686,0.001265,0
3,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-06-10,0.328983,0.038574,0.049463,0.115654,0.004654,0.416112,0.003223,0.081001,...,,,0.002524,0.007841,0.007421,,0.000728,0.003591,0.007998,0
4,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-07-19,0.496989,0.005552,0.041452,0.133631,0.007363,0.419864,0.003393,0.098308,...,,,0.003823,0.009599,0.006957,,0.008746,0.007101,0.006658,0


In [26]:
# Step 5: One-hot encoding for categorical variables
# Columns that are treated as categorical and expanded into indicator columns
categorical_columns = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

# Function to read and process data in chunks
def process_chunks(file_path, chunk_size=100000, columns=None):
    """One-hot encode the categorical columns of a CSV, reading it in chunks.

    Every chunk is encoded on its own, so a category that does not occur in
    a chunk yields no indicator column for that chunk. After concatenation
    such gaps would appear as NaN (turning the 0/1 indicators into floats),
    so they are filled with 0 and the indicators are cast back to ``int``.

    Parameters
    ----------
    file_path : str, path object or file-like
        CSV source; passed straight to ``pd.read_csv``.
    chunk_size : int, default 100000
        Number of rows read per chunk.
    columns : list of str, optional
        Columns to encode. When omitted, the module-level
        ``categorical_columns`` list is used.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated (fresh 0..n-1 index), with each encoded
        column replaced by integer 0/1 indicator columns named
        ``<column>_<value>``.
    """
    encode_cols = categorical_columns if columns is None else list(columns)

    encoded_chunks = []
    # Indicator column names in first-seen order across all chunks
    # (dict keys used as an ordered set).
    indicator_cols = {}

    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Convert all specified columns to strings so values are encoded by
        # their text form.
        for col in encode_cols:
            chunk[col] = chunk[col].astype(str)

        # Perform one-hot encoding, producing int (0 or 1) indicators directly
        encoded_chunk = pd.get_dummies(chunk, columns=encode_cols, dtype=int)

        # Whatever get_dummies added beyond the source columns is an indicator.
        source_cols = set(chunk.columns)
        indicator_cols.update(
            dict.fromkeys(c for c in encoded_chunk.columns if c not in source_cols)
        )

        encoded_chunks.append(encoded_chunk)

        print(f"Processed chunk of size {len(chunk)}")

    combined = pd.concat(encoded_chunks, ignore_index=True)

    # A category absent from a chunk means "0 for every row of that chunk",
    # not "missing": fill the NaN left by concat and restore the int dtype.
    for name in indicator_cols:
        combined[name] = combined[name].fillna(0).astype(int)

    return combined

# Encode the saved development sample
df_encoded = process_chunks('data/development_sample.csv')

# Report completion together with the resulting dimensions
print(
    "One-hot encoding completed.",
    f"Shape of encoded DataFrame: {df_encoded.shape}",
    sep="\n",
)

# Preview the leading rows of the encoded DataFrame
print(df_encoded.head())

# List the dtype of every column, including the new indicator columns
print(df_encoded.dtypes)

Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 100000
Processed chunk of size 7082
One-hot encoding completed.
Shape of encoded DataFrame: (1107082, 235)
                                         customer_ID         S_2       P_2  \
0  000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...  2017-03-11  0.374606   
1  000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...  2017-04-22  0.414269   
2  000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...  2017-05-12  0.413310   
3  000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...  2017-06-10  0.328983   
4  000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...  2017-07-19  0.496989   

       D_39       B_1       B_2       R_1       S_3      D_41       B_3  ...  \
0  0.03351

In [27]:
# Write the encoded frame to disk without the row index
df_encoded.to_csv(path_or_buf='data/train_encoded_data.csv', index=False)

In [31]:
# Step 6: EDA, starting from the encoded data stored on disk
df = pd.read_csv(filepath_or_buffer='data/train_encoded_data.csv')


In [35]:
# Parse S_2 into datetimes; the date-window helpers in this notebook subtract
# a pd.DateOffset from its maximum, which needs real timestamps.
df['S_2'] = pd.to_datetime(df['S_2'])

In [36]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_66_1.0,D_66_nan,D_68_0.0,D_68_1.0,D_68_2.0,D_68_3.0,D_68_4.0,D_68_5.0,D_68_6.0,D_68_nan
0,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-03-11,0.374606,0.033519,0.044293,1.008622,0.00147,0.459235,0.002339,0.006168,...,0,1,0,0,0,0,0,0,0,1
1,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-04-22,0.414269,0.002516,0.059667,0.123964,0.004374,0.434148,0.001405,0.05213,...,0,1,0,0,0,0,0,0,0,1
2,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-05-12,0.41331,0.003285,0.053418,0.304955,0.002316,0.415906,0.009388,0.04878,...,0,1,0,0,1,0,0,0,0,0
3,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-06-10,0.328983,0.038574,0.049463,0.115654,0.004654,0.416112,0.003223,0.081001,...,0,1,0,0,1,0,0,0,0,0
4,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-07-19,0.496989,0.005552,0.041452,0.133631,0.007363,0.419864,0.003393,0.098308,...,0,1,0,0,1,0,0,0,0,0


In [38]:
# calculate last 6 months average value for numerical columns
def calculate_average_last_6_months(df):
    """Average every numeric column over the six months up to the latest ``S_2`` date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``S_2`` date column, either already datetime or in a
        form ``pd.to_datetime`` can parse. ``df`` itself is not modified.

    Returns
    -------
    pandas.Series
        Mean of each numeric column over the rows whose ``S_2`` lies in
        [latest date - 6 months, latest date], indexed by column name.
    """
    # Parse locally instead of relying on the caller having converted S_2:
    # subtracting a DateOffset from a raw string raises a TypeError.
    statement_dates = pd.to_datetime(df['S_2'])
    end_date = statement_dates.max()
    start_date = end_date - pd.DateOffset(months=6)

    # Filter data for the last 6 months (both ends inclusive)
    in_window = (statement_dates >= start_date) & (statement_dates <= end_date)

    # Calculate the average for each numerical column
    return df[in_window].mean(numeric_only=True)

def calculate_average_last_12_months(df):
    """Average every numeric column over the twelve months up to the latest ``S_2`` date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``S_2`` date column, either already datetime or in a
        form ``pd.to_datetime`` can parse. ``df`` itself is not modified.

    Returns
    -------
    pandas.Series
        Mean of each numeric column over the rows whose ``S_2`` is on or
        after (latest date - 12 months), indexed by column name.
    """
    # Parse locally instead of relying on the caller having converted S_2:
    # subtracting a DateOffset from a raw string raises a TypeError.
    statement_dates = pd.to_datetime(df['S_2'])
    end_date = statement_dates.max()
    start_date = end_date - pd.DateOffset(months=12)

    # No upper bound is needed: end_date is the maximum of the column.
    recent_rows = df[statement_dates >= start_date]
    return recent_rows.mean(numeric_only=True)

def calculate_min_last_6_months(df):
    """Minimum of every numeric column over the six months up to the latest ``S_2`` date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``S_2`` date column, either already datetime or in a
        form ``pd.to_datetime`` can parse. ``df`` itself is not modified.

    Returns
    -------
    pandas.Series
        Minimum of each numeric column over the rows whose ``S_2`` lies in
        [latest date - 6 months, latest date], indexed by column name.
    """
    # Parse locally instead of relying on the caller having converted S_2:
    # subtracting a DateOffset from a raw string raises a TypeError.
    statement_dates = pd.to_datetime(df['S_2'])
    end_date = statement_dates.max()
    start_date = end_date - pd.DateOffset(months=6)

    # Both window ends are inclusive.
    in_window = (statement_dates >= start_date) & (statement_dates <= end_date)
    return df[in_window].min(numeric_only=True)

def calculate_max_last_9_months(df):
    """Maximum of every numeric column over the nine months up to the latest ``S_2`` date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``S_2`` date column, either already datetime or in a
        form ``pd.to_datetime`` can parse. ``df`` itself is not modified.

    Returns
    -------
    pandas.Series
        Maximum of each numeric column over the rows whose ``S_2`` lies in
        [latest date - 9 months, latest date], indexed by column name.
    """
    # Parse locally instead of relying on the caller having converted S_2:
    # subtracting a DateOffset from a raw string raises a TypeError.
    statement_dates = pd.to_datetime(df['S_2'])
    end_date = statement_dates.max()
    start_date = end_date - pd.DateOffset(months=9)

    # Both window ends are inclusive.
    in_window = (statement_dates >= start_date) & (statement_dates <= end_date)
    return df[in_window].max(numeric_only=True)

In [41]:
# Work on an independent (deep) copy so later edits do not touch df
eda_df = df.copy(deep=True)

In [43]:
# Remove one-hot encoding variables: every column whose name begins with one
# of these prefixes is dropped from eda_df.
drop_prefixes = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

# Anchored alternation: matches a name that starts with any listed prefix
regex_pattern = '^({})'.format('|'.join(drop_prefixes))

one_hot_columns = eda_df.filter(regex=regex_pattern).columns
eda_df = eda_df.drop(columns=one_hot_columns)

In [44]:
# Mean of every numeric column over the most recent six months.
# NOTE(review): this passes `df`, not the `eda_df` frame that had its one-hot
# columns dropped, so the indicator columns are averaged as well. Confirm
# that this is intended.
average_values = calculate_average_last_6_months(df)

# Tag each statistic with an "_Ave_6" suffix; a 'customer_ID' label, should
# one be present, keeps its name.
new_column_names = {
    name: f"{name}_Ave_6"
    for name in average_values.index
    if name != 'customer_ID'
}
average_values = average_values.rename(index=new_column_names)

average_values

P_2_Ave_6         0.645674
D_39_Ave_6        0.167517
B_1_Ave_6         0.132256
B_2_Ave_6         0.604573
R_1_Ave_6         0.093123
                    ...   
D_68_3.0_Ave_6    0.088882
D_68_4.0_Ave_6    0.088990
D_68_5.0_Ave_6    0.223858
D_68_6.0_Ave_6    0.498356
D_68_nan_Ave_6    0.036730
Length: 233, dtype: float64