In [None]:
# Importing Libraries
from google.cloud import storage
from io import StringIO
import pandas as pd

# Bucket that holds the source files
source_bucket_name = "my-bigdata-project-kg"

# Prefix (folder) inside the bucket that is searched
folder_pattern = "landing/"

# Client object that points to GCS
storage_client = storage.Client()

# Blobs (objects or files) in the bucket whose name starts with the prefix
blobs = storage_client.list_blobs(source_bucket_name, prefix=folder_pattern)

# Keep only the blobs whose object name ends in .csv
filtered_blobs = list(filter(lambda candidate: candidate.name.endswith('.csv'), blobs))

# Report how many CSV blobs were kept
print(f"Found {len(filtered_blobs)} CSV files.")

# Column names and data types for reading CSV files.
# Each row is laid out as: datetime, gamemode, then the same block of
# per-player fields (tag, trophies, crowns, card1..card8) for player1 and player2.
# NOTE(review): the integer dtypes below are the plain (non-nullable) ones —
# confirm the files never contain blank numeric fields.
_PLAYER_FIELD_TYPES = {
    'tag': 'string',
    'trophies': 'int32',
    'crowns': 'int32',
    **{f'card{slot}': 'int64' for slot in range(1, 9)},
}

# Insertion order of this dict is the column order of the files
data_types = {
    'datetime': 'string',
    'gamemode': 'int64',
    **{
        f'{player}_{field}': dtype
        for player in ('player1', 'player2')
        for field, dtype in _PLAYER_FIELD_TYPES.items()
    },
}

# Column names, in file order, taken from the keys of the dtype mapping
column_names = list(data_types)

# Define the EDA function
def perform_eda(df: pd.DataFrame) -> None:
    """Print a basic exploratory summary of ``df`` to stdout.

    Reports, in order:
      1. the number of rows,
      2. the count of missing values for every column that has any,
      3. ``describe()`` statistics for the numeric columns,
      4. the minimum and maximum of every datetime column.

    An empty frame only produces a short notice. Sections that have nothing
    to report (no missing values, no numeric columns, no datetime columns)
    print a one-line message instead of failing or printing an empty repr.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. Every result is printed rather than returned.
    """
    print("Starting EDA...")
    if df.empty:
        print("DataFrame is empty. No EDA to perform.")
        return

    # Number of observations
    num_observations = df.shape[0]
    print(f"Number of observations: {num_observations}")

    # Number of missing fields; only columns with at least one gap are listed
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    print("Number of missing values in each field:")
    if missing_values.empty:
        # Avoid printing the bare "Series([], dtype: int64)" repr
        print("No missing values found.")
    else:
        print(missing_values)

    # Summary statistics for numeric variables.
    # describe(include='number') raises when no column is numeric, so check first.
    print("Summary statistics for numeric variables:")
    if df.select_dtypes(include='number').columns.size > 0:
        print(df.describe(include='number'))
    else:
        print("No numeric variables found.")

    # Summary for date variables; 'datetimetz' also picks up timezone-aware columns
    date_columns = df.select_dtypes(include=['datetime64', 'datetimetz']).columns
    if date_columns.size > 0:
        for date_col in date_columns:
            min_date = df[date_col].min()
            max_date = df[date_col].max()
            print(f"Min date for {date_col}: {min_date}")
            print(f"Max date for {date_col}: {max_date}")
    else:
        print("No date variables found.")

# Process each CSV blob in turn: download, parse, add a timestamp column, report
for blob in filtered_blobs:
    print(f"Processing file: {blob.name} with size {blob.size} bytes")

    # Download the object as text and wrap it in a file-like buffer
    csv_buffer = StringIO(blob.download_as_text())

    # Parse the CSV using the column names and data types defined for these files
    df = pd.read_csv(csv_buffer, names=column_names, dtype=data_types)

    # Parse the 'datetime' strings into timestamps and store them as 'gametime'
    parsed_gametime = pd.to_datetime(df['datetime'], format='%Y%m%dT%H%M%S.%fZ')
    df['gametime'] = parsed_gametime

    # Print the EDA report for this file
    perform_eda(df)

Found 408 CSV files.
Processing file: landing/20221004-20221107/20221004-20221107/20221004.csv with size 39151 bytes
Starting EDA...
Number of observations: 188
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  1.880000e+02        188.000000      188.000000   1.880000e+02   
mean   7.200009e+07       5881.952128        1.021277   2.600001e+07   
std    1.304033e+02        482.456596        1.013056   1.025888e+01   
min    7.200001e+07       5000.000000        0.000000   2.600000e+07   
25%    7.200001e+07       5489.000000        0.000000   2.600001e+07   
50%    7.200001e+07       5949.500000        1.000000   2.600001e+07   
75%    7.200029e+07       6125.500000        1.000000   2.600002e+07   
max    7.200029e+07       6863.000000        3.000000   2.600005e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   1.8800

Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  2.140000e+02        214.000000      214.000000   2.140000e+02   
mean   7.200004e+07       5700.588785        0.873832   2.600001e+07   
std    6.394528e+01        905.806154        1.010725   7.495542e+00   
min    7.200001e+07          7.000000        0.000000   2.600000e+07   
25%    7.200001e+07       5390.750000        0.000000   2.600000e+07   
50%    7.200004e+07       5853.000000        1.000000   2.600001e+07   
75%    7.200004e+07       6092.500000        1.000000   2.600001e+07   
max    7.200029e+07       6841.000000        3.000000   2.600005e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   2.140000e+02   2.140000e+02   2.140000e+02   2.140000e+02   
mean    2.600002e+07   2.600003e+07   2.607481e+07   2.626640e+07   
std     1.153827e+01   1.543150e+01   2.808663e+05   5.642465e+05   
min     2.600000e+07   2.600000e+

Starting EDA...
Number of observations: 277
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  2.770000e+02        277.000000      277.000000   2.770000e+02   
mean   7.200005e+07       6025.212996        0.848375   2.600001e+07   
std    1.043110e+02        715.562266        0.931789   8.710755e+00   
min    7.200001e+07        235.000000        0.000000   2.600000e+07   
25%    7.200001e+07       5637.000000        0.000000   2.600001e+07   
50%    7.200001e+07       6029.000000        1.000000   2.600001e+07   
75%    7.200001e+07       6490.000000        1.000000   2.600001e+07   
max    7.200029e+07       6950.000000        3.000000   2.600005e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   2.770000e+02   2.770000e+02   2.770000e+02   2.770000e+02   
mean    2.600003e+07   2.600004e+07   2.609390e+07   2.633578e

Starting EDA...
Number of observations: 303
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  3.030000e+02        303.000000      303.000000   3.030000e+02   
mean   7.200009e+07       5999.891089        0.864686   2.600001e+07   
std    1.020802e+02        635.209861        0.941065   9.077013e+00   
min    7.200001e+07       3400.000000        0.000000   2.600000e+07   
25%    7.200004e+07       5452.500000        0.000000   2.600000e+07   
50%    7.200004e+07       5992.000000        1.000000   2.600001e+07   
75%    7.200004e+07       6285.000000        1.000000   2.600002e+07   
max    7.200029e+07       7264.000000        3.000000   2.600005e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   3.030000e+02   3.030000e+02   3.030000e+02   3.030000e+02   
mean    2.600002e+07   2.600004e+07   2.605285e+07   2.630037e

Starting EDA...
Number of observations: 402
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  4.020000e+02        402.000000      402.000000   4.020000e+02   
mean   7.200002e+07       6161.482587        0.845771   2.600001e+07   
std    5.247346e+01        594.420660        0.958539   8.869230e+00   
min    7.200001e+07       4054.000000        0.000000   2.600000e+07   
25%    7.200001e+07       5704.750000        0.000000   2.600000e+07   
50%    7.200001e+07       6183.000000        1.000000   2.600001e+07   
75%    7.200001e+07       6710.250000        1.000000   2.600001e+07   
max    7.200029e+07       7363.000000        3.000000   2.600006e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   4.020000e+02   4.020000e+02   4.020000e+02   4.020000e+02   
mean    2.600002e+07   2.600252e+07   2.603984e+07   2.628362e

Starting EDA...
Number of observations: 637
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  6.370000e+02        637.000000      637.000000   6.370000e+02   
mean   7.200020e+07       6107.875981        0.924647   2.600001e+07   
std    5.126507e+01       1060.212496        0.976439   1.043278e+01   
min    7.200001e+07          2.000000        0.000000   2.600000e+07   
25%    7.200020e+07       5803.000000        0.000000   2.600001e+07   
50%    7.200020e+07       6256.000000        1.000000   2.600001e+07   
75%    7.200020e+07       6604.000000        1.000000   2.600002e+07   
max    7.200029e+07       7643.000000        3.000000   2.600005e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   6.370000e+02   6.370000e+02   6.370000e+02   6.370000e+02   
mean    2.600002e+07   2.600003e+07   2.602359e+07   2.622924e

Starting EDA...
Number of observations: 1188
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  1.188000e+03       1188.000000     1188.000000   1.188000e+03   
mean   7.200013e+07       4129.002525        0.904040   2.600001e+07   
std    1.537142e+02       3115.475674        0.989013   1.153539e+01   
min    7.200001e+07          0.000000        0.000000   2.600000e+07   
25%    7.200001e+07         30.000000        0.000000   2.600000e+07   
50%    7.200001e+07       6494.500000        1.000000   2.600001e+07   
75%    7.200032e+07       6574.000000        1.000000   2.600001e+07   
max    7.200032e+07       6853.000000        3.000000   2.600006e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   1.188000e+03   1.188000e+03   1.188000e+03   1.188000e+03   
mean    2.600002e+07   2.600088e+07   2.604045e+07   2.628540

Starting EDA...
Number of observations: 1649
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  1.649000e+03       1649.000000     1649.000000   1.649000e+03   
mean   7.200016e+07       3584.838690        0.896301   2.600001e+07   
std    1.524925e+02       3207.756959        1.005829   1.326270e+01   
min    7.200001e+07          0.000000        0.000000   2.600000e+07   
25%    7.200001e+07         30.000000        0.000000   2.600000e+07   
50%    7.200004e+07       6355.000000        1.000000   2.600001e+07   
75%    7.200032e+07       6553.000000        1.000000   2.600002e+07   
max    7.200033e+07       7102.000000        3.000000   2.600008e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   1.649000e+03   1.649000e+03   1.649000e+03   1.649000e+03   
mean    2.600003e+07   2.600064e+07   2.607160e+07   2.636390

Starting EDA...
Number of observations: 4193
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  4.193000e+03       4193.000000     4193.000000   4.193000e+03   
mean   7.200022e+07       3819.896971        0.886954   2.600001e+07   
std    1.158661e+02       3115.164365        0.949640   1.266066e+01   
min    7.200001e+07          0.000000        0.000000   2.600000e+07   
25%    7.200020e+07         30.000000        0.000000   2.600000e+07   
50%    7.200020e+07       6336.000000        1.000000   2.600001e+07   
75%    7.200033e+07       6578.000000        1.000000   2.600002e+07   
max    7.200033e+07       7500.000000        3.000000   2.600008e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   4.193000e+03   4.193000e+03   4.193000e+03   4.193000e+03   
mean    2.600003e+07   2.600099e+07   2.606110e+07   2.633298

Starting EDA...
Number of observations: 14168
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  1.416800e+04      14168.000000    14168.000000   1.416800e+04   
mean   7.200020e+07       3507.728755        0.869001   2.600001e+07   
std    1.398096e+02       3157.694029        0.947215   1.271077e+01   
min    7.200001e+07        -31.000000        0.000000   2.600000e+07   
25%    7.200001e+07         30.000000        0.000000   2.600000e+07   
50%    7.200029e+07       5424.000000        1.000000   2.600001e+07   
75%    7.200032e+07       6567.000000        1.000000   2.600002e+07   
max    7.200033e+07       7500.000000        3.000000   2.600008e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   1.416800e+04   1.416800e+04   1.416800e+04   1.416800e+04   
mean    2.600010e+07   2.600053e+07   2.605771e+07   2.63245

Starting EDA...
Number of observations: 40639
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  4.063900e+04      40639.000000    40639.000000   4.063900e+04   
mean   7.200012e+07       4502.915943        0.862669   2.600001e+07   
std    1.494940e+02       3053.330867        0.942729   1.267247e+01   
min    7.200001e+07          0.000000        0.000000   2.600000e+07   
25%    7.200001e+07         30.000000        0.000000   2.600000e+07   
50%    7.200001e+07       6500.000000        1.000000   2.600001e+07   
75%    7.200032e+07       6621.000000        1.000000   2.600002e+07   
max    7.200032e+07       7500.000000        3.000000   2.600008e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   4.063900e+04   4.063900e+04   4.063900e+04   4.063900e+04   
mean    2.600010e+07   2.600080e+07   2.605918e+07   2.63288

Starting EDA...
Number of observations: 476914
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  4.769140e+05     476914.000000   476914.000000   4.769140e+05   
mean   7.200020e+07       4233.585988        0.845593   2.600008e+07   
std    1.338728e+02       3069.388780        0.929435   1.149318e+04   
min    7.200001e+07          0.000000        0.000000   2.600000e+07   
25%    7.200004e+07         30.000000        0.000000   2.600000e+07   
50%    7.200029e+07       6397.000000        1.000000   2.600001e+07   
75%    7.200033e+07       6596.000000        1.000000   2.600002e+07   
max    7.200033e+07       7500.000000        3.000000   2.800000e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   4.769140e+05   4.769140e+05   4.769140e+05   4.769140e+05   
mean    2.600018e+07   2.600097e+07   2.605867e+07   2.6330

Starting EDA...
Number of observations: 1685888
Number of missing values in each field:
Series([], dtype: int64)
Summary statistics for numeric variables:
           gamemode  player1_trophies  player1_crowns  player1_card1  \
count  1.685888e+06      1.685888e+06    1.685888e+06   1.685888e+06   
mean   7.200021e+07      3.212175e+03    8.682771e-01   2.600009e+07   
std    1.450175e+02      3.222912e+03    9.426297e-01   1.183121e+04   
min    7.200001e+07      0.000000e+00    0.000000e+00   2.600000e+07   
25%    7.200001e+07      0.000000e+00    0.000000e+00   2.600000e+07   
50%    7.200032e+07      1.708000e+03    1.000000e+00   2.600001e+07   
75%    7.200032e+07      6.559000e+03    1.000000e+00   2.600002e+07   
max    7.200032e+07      7.500000e+03    3.000000e+00   2.800000e+07   

       player1_card2  player1_card3  player1_card4  player1_card5  \
count   1.685888e+06   1.685888e+06   1.685888e+06   1.685888e+06   
mean    2.600018e+07   2.600095e+07   2.605569e+07   2.633