<a href="https://colab.research.google.com/github/idebroy/ml-ds/blob/main/offline_fail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: write a function to load a csv file from google drive

import pandas as pd
from google.colab import drive

def load_csv_from_drive(file_path):
  """Mount Google Drive and read a CSV file into a pandas DataFrame.

  Drive is mounted at '/content/drive' with force_remount=True on every
  call, before the file is read.

  Args:
    file_path: Location of the CSV file; handed unchanged to pd.read_csv.

  Returns:
    The parsed pandas DataFrame, or None when reading fails. In the failure
    case a message describing the problem is printed.
  """
  drive.mount('/content/drive', force_remount=True)

  # Single exit point: stays None unless the read succeeds.
  loaded = None
  try:
    loaded = pd.read_csv(file_path)
  except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
  except Exception as err:
    print(f"An error occurred while loading the file: {err}")
  return loaded

# Example usage:
# The CSV file is expected at '/content/drive/My Drive/Colab-Stuff/offline-fail.csv'.
testFilePath = '/content/drive/My Drive/Colab-Stuff/offline-fail.csv'
df = load_csv_from_drive(testFilePath)

# load_csv_from_drive reports a failed read by printing a message and
# returning None. Stop here in that case instead of announcing success and
# then failing on df.head() with an unrelated AttributeError.
if df is None:
  raise RuntimeError(f"Could not load {testFilePath}")
print(f"Drive \"connected to:\" {testFilePath}")

# The file loaded successfully; preview the first few rows.
print(df.head())

Mounted at /content/drive
Drive "connected to:" /content/drive/My Drive/Colab-Stuff/offline-fail.csv
           id           uuid  order_id  customer_id  customer_uuid  \
0  3920481938  8SYQXGQZAX75P    994395          NaN            NaN   
1  3920481963  SY8ZX7YNZ82P4    994433          NaN            NaN   
2  3920482443  C26H3KW534N2E    994971          NaN            NaN   
3  3920492858  ASJF540XTPASP   1026537          NaN            NaN   
4  3920493555  HPMGXNTE97NCM   1027587          NaN            NaN   

   merchant_id  merchant_tender_id  amount  tip_amount  tax_amount  ...  \
0        14005              113167      55         NaN          10  ...   
1        14005              113167    2170         NaN         395  ...   
2        14005              113167    1136         NaN         207  ...   
3         8277               66813    1100         0.0         100  ...   
4        12617              101703     452         NaN           0  ...   

   payment_refund_id  exter

In [None]:
def impute_categorical_nan(data, column):
  """
  Imputes missing values in a categorical column with the most frequent value.

  The column is reassigned on `data` itself, so the caller's DataFrame is
  modified; the same object is returned for convenience. The value used for
  filling is printed.

  Args:
    data: Pandas DataFrame.
    column: Name of the categorical column with missing values.

  Returns:
    The same Pandas DataFrame with the column's missing values filled. If
    the column has no non-missing values there is nothing to fill with and
    the DataFrame is returned unchanged.
  """

  # mode() skips missing values, so it is empty when the whole column is
  # missing; indexing it would then fail with an unhelpful KeyError.
  modes = data[column].mode()
  if modes.empty:
    return data

  # When several values tie, the first one reported by mode() is used.
  most_frequent_value = modes.iloc[0]
  print(most_frequent_value)

  # Fill missing values with the most frequent value
  data[column] = data[column].fillna(most_frequent_value)

  return data

def impute_numerical_nan(data, column):
  """
  Replaces the missing entries of a numerical column with that column's mean.

  The column is reassigned on `data` itself and the same DataFrame object is
  returned.

  Args:
    data: Pandas DataFrame.
    column: Name of the numerical column with missing values.
  Returns:
    Pandas DataFrame with imputed values.
  """
  values = data[column]

  # The mean is taken over the original column, then used as the fill value.
  data[column] = values.fillna(values.mean())
  return data

def impute_nan(data, column):
  """
  Fills the missing values of a column, choosing the strategy from its dtype.

  Columns that select_dtypes reports as 'number' go to impute_numerical_nan;
  every other column goes to impute_categorical_nan.

  Args:
    data: Pandas DataFrame.
    column: Name of the column with missing values.

  Returns:
    Pandas DataFrame with imputed values.
  """
  numeric_columns = data.select_dtypes(include=['number']).columns

  # Guard clause: anything that is not numeric takes the categorical path.
  if column not in numeric_columns:
    return impute_categorical_nan(data, column)

  return impute_numerical_nan(data, column)

def find_cat_correlation(data, cat_cols, target_col):
  """
  Finds the categorical column most highly correlated (using Cramér's V) with the target column.

  For every candidate column a contingency table against the target is
  built and Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))) is computed
  from the uncorrected Pearson chi-squared statistic. A candidate whose table
  has fewer than two rows or two columns (for example a constant column)
  cannot show any association and is scored 0.0.

  Args:
      data: Pandas DataFrame.
      cat_cols: List of categorical columns. The target column may be
          included; it is skipped.
      target_col: Name of the target column (must be categorical).

  Returns:
      Name of the most highly correlated categorical column. On ties the
      candidate that appears first in cat_cols wins.

  Raises:
      ValueError: If cat_cols contains no column other than target_col.
  """
  # Imported locally, like chi2_contingency, so the function does not rely on
  # a global `np` being defined by some other cell.
  import numpy as np
  from scipy.stats import chi2_contingency

  candidates = [col for col in cat_cols if col != target_col]
  if not candidates:
    # Without this check max() below fails with the opaque
    # "max() arg is an empty sequence".
    raise ValueError(
        f"cat_cols must contain at least one column other than the target column '{target_col}'")

  correlations = {}

  for col in candidates:
    contingency_table = pd.crosstab(data[target_col], data[col])
    min_dim = min(contingency_table.shape) - 1

    if min_dim < 1:
      # Degenerate table (empty, or a single row/column): the formula would
      # divide by zero, so record "no association" instead.
      correlations[col] = 0.0
      continue

    # correction=False: Yates' continuity correction (scipy's default for
    # 2x2 tables) would bias Cramér's V downwards.
    chi2 = chi2_contingency(contingency_table, correction=False)[0]
    n = contingency_table.to_numpy().sum()
    correlations[col] = float(np.sqrt(chi2 / (n * min_dim)))

  most_correlated_column = max(correlations, key=correlations.get)
  return most_correlated_column

In [None]:
# Summarize the loaded DataFrame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1975 entries, 0 to 1974
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1975 non-null   int64  
 1   uuid                 1975 non-null   object 
 2   order_id             1975 non-null   int64  
 3   customer_id          0 non-null      float64
 4   customer_uuid        0 non-null      float64
 5   merchant_id          1975 non-null   int64  
 6   merchant_tender_id   1975 non-null   int64  
 7   amount               1975 non-null   int64  
 8   tip_amount           1428 non-null   float64
 9   tax_amount           1975 non-null   int64  
 10  cashback_amount      1585 non-null   float64
 11  cash_tendered        0 non-null      float64
 12  gateway_tx_id        1975 non-null   int64  
 13  tip_gateway_tx_id    2 non-null      float64
 14  payment_refund_id    0 non-null      float64
 15  external_payment_id  120 non-null    o

## Compute missing values

In [None]:
# Count the NaN entries of every column.
missing_values_count = df.isna().sum()

# Report the per-column totals under a heading.
print("Missing values per column:", missing_values_count, sep="\n")

Missing values per column:
id                        0
uuid                      0
order_id                  0
customer_id            1975
customer_uuid          1975
merchant_id               0
merchant_tender_id        0
amount                    0
tip_amount              547
tax_amount                0
cashback_amount         390
cash_tendered          1975
gateway_tx_id             0
tip_gateway_tx_id      1973
payment_refund_id      1975
external_payment_id    1855
account_id                2
device_id               408
note                   1805
created_time              0
client_created_time       0
modified_time             0
offline                   0
result                    0
dtype: int64


In [None]:
# Names of the columns whose missing-value count exceeds 500.
columns_to_drop = missing_values_count.loc[missing_values_count.gt(500)].index.tolist()

# Restrict the list to names df still has; df.drop would raise on a name
# that is not among its columns.
existing_columns_to_drop = [name for name in columns_to_drop if name in df.columns]

# Remove those columns from the DataFrame.
df = df.drop(existing_columns_to_drop, axis="columns")

# Print the resulting schema to confirm the columns are gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1975 entries, 0 to 1974
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1975 non-null   int64  
 1   uuid                 1975 non-null   object 
 2   order_id             1975 non-null   int64  
 3   merchant_id          1975 non-null   int64  
 4   merchant_tender_id   1975 non-null   int64  
 5   amount               1975 non-null   int64  
 6   tax_amount           1975 non-null   int64  
 7   cashback_amount      1585 non-null   float64
 8   gateway_tx_id        1975 non-null   int64  
 9   account_id           1973 non-null   float64
 10  device_id            1567 non-null   float64
 11  created_time         1975 non-null   object 
 12  client_created_time  1975 non-null   object 
 13  modified_time        1975 non-null   object 
 14  offline              1975 non-null   int64  
 15  result               1975 non-null   o

In [None]:
# Fill the missing 'cashback_amount' entries with the column's most frequent
# value (the helper also prints that value).
df = impute_categorical_nan(df, 'cashback_amount')
# NOTE(review): 'amount' is the only candidate column and is also the target.
# find_cat_correlation skips the target, so it has nothing to score and this
# call raises ValueError. The intended candidate columns are not visible
# here — TODO confirm and list columns other than the target.
columns = ['amount']
res = find_cat_correlation(df, columns, 'amount')

0.0


ValueError: max() arg is an empty sequence