In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn
import pandas as pd 
import numpy as np
import math
from sklearn.preprocessing import StandardScaler

## Begin With the End In Mind

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_memory = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_memory = df.memory_usage().sum() / 1024**2
    if verbose: 
        print(f"Memory usage of dataframe after reduction {end_memory} MB")
        print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [2]:
train = pd.read_csv("C:/Users/J\Downloads/tabular-playground-series-dec-2021/train.csv")

In [6]:
train = reduce_mem_usage(train)

Memory usage of dataframe after reduction 259.3995361328125 MB
Reduced by 84.82142251275553 % 


In [7]:
train["Cover_Type"].value_counts().sort_values(ascending=False)

2    2262087
1    1468136
3     195712
7      62261
6      11426
4        377
5          1
Name: Cover_Type, dtype: int64

In [8]:
test = pd.read_csv("C:/Users/J\Downloads/tabular-playground-series-dec-2021/test.csv")

In [9]:
test = reduce_mem_usage(test)

Memory usage of dataframe after reduction 63.89630126953125 MB
Reduced by 84.77270261157742 % 


## Data Explore

In [11]:
train.drop(["Cover_Type"], axis=1).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,4000000.0,2000000.0,1154701.0,0.0,999999.75,1999999.5,2999999.25,3999999.0
Elevation,4000000.0,2980.192,289.0482,1773.0,2760.0,2966.0,3217.0,4383.0
Aspect,4000000.0,151.5857,109.9611,-33.0,60.0,123.0,247.0,407.0
Slope,4000000.0,15.09754,8.546731,-3.0,9.0,14.0,20.0,64.0
Horizontal_Distance_To_Hydrology,4000000.0,271.3154,226.5497,-92.0,110.0,213.0,361.0,1602.0
Vertical_Distance_To_Hydrology,4000000.0,51.66262,68.21597,-317.0,4.0,31.0,78.0,647.0
Horizontal_Distance_To_Roadways,4000000.0,1766.642,1315.61,-287.0,822.0,1436.0,2365.0,7666.0
Hillshade_9am,4000000.0,211.8375,30.75996,-4.0,198.0,218.0,233.0,301.0
Hillshade_Noon,4000000.0,221.0614,22.23134,49.0,210.0,224.0,237.0,279.0
Hillshade_3pm,4000000.0,140.8109,43.69864,-53.0,115.0,142.0,169.0,272.0


## Feature Engineering

In [15]:
# https://www.kaggle.com/hamzaghanmi/tps-dec-step-by-step
# generate new features 
cols = [e for e in test.columns if e not in ('Id')]

train['binned_elevation'] = [math.floor(v/50.0) for v in train['Elevation']]
test['binned_elevation'] = [math.floor(v/50.0) for v in test['Elevation']]

train['Horizontal_Distance_To_Roadways_Log'] = [np.log(v+300) for v in train['Horizontal_Distance_To_Roadways']]
test['Horizontal_Distance_To_Roadways_Log'] = [np.log(v+300) for v in test['Horizontal_Distance_To_Roadways']]

train['Soil_Type12_32'] = train['Soil_Type32'] + train['Soil_Type12']
test['Soil_Type12_32'] = test['Soil_Type32'] + test['Soil_Type12']
train['Soil_Type23_22_32_33'] = train['Soil_Type23'] + train['Soil_Type22'] + train['Soil_Type32'] + train['Soil_Type33']
test['Soil_Type23_22_32_33'] = test['Soil_Type23'] + test['Soil_Type22'] + test['Soil_Type32'] + test['Soil_Type33']

cols = [e for e in test.columns if e not in ('Id')]

In [16]:
# delete the sample with target 5
train.drop(train[train['Cover_Type']==5].index,inplace=True)

In [17]:
train["binned_elevation"].nunique()

53

In [22]:
ss = StandardScaler()

In [26]:
train[cols] = ss.fit_transform(train[cols].copy())

In [24]:
test = ss.transform(test[cols].copy())

In [25]:
train.to_csv("C:/Users/J/Desktop/data/kaggle_tps_dec_2021/train_tr1.csv", index=False)

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

In [None]:
train.to_csv("C:/Users/J/Desktop/data/kaggle_tps_dec_2021/test_tr1.csv", index=False)

## Reduce To Embedding

In [10]:
# Check if any have more than one soil type
soil_cols = [col for col in list(train.columns.values) if "Soil_Type" in col]
soil_sum = train[soil_cols].sum(axis=1)
soil_sum.head()

0    1
1    1
2    1
3    1
4    1
dtype: int64

In [13]:
soil_sum.value_counts()

0    1655723
1    1562456
2     618915
3     140362
4      20298
5       2050
6        183
7         13
dtype: int64