The data is downloaded from the Kaggle Bluebook for Bulldozers competition: https://www.kaggle.com/c/bluebook-for-bulldozers/data


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [4]:
# Load the combined train/validation CSV into `df`.
# `saledate` is parsed to datetime64 at read time; low_memory=False makes
# pandas read the file in one pass instead of in internal chunks.
df = pd.read_csv(
    "data/TrainAndValid.csv",
    parse_dates=["saledate"],
    low_memory=False,
)


In [5]:
# Print a summary of `df`: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412698 entries, 0 to 412697
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   SalesID                   412698 non-null  int64         
 1   SalePrice                 412698 non-null  float64       
 2   MachineID                 412698 non-null  int64         
 3   ModelID                   412698 non-null  int64         
 4   datasource                412698 non-null  int64         
 5   auctioneerID              392562 non-null  float64       
 6   YearMade                  412698 non-null  int64         
 7   MachineHoursCurrentMeter  147504 non-null  float64       
 8   UsageBand                 73670 non-null   object        
 9   saledate                  412698 non-null  datetime64[ns]
 10  fiModelDesc               412698 non-null  object        
 11  fiBaseModel               412698 non-null  object        
 12  fi

In [6]:
# Put the rows in chronological order (earliest sale first).
# The sort is applied to `df` itself rather than returning a new frame.
df.sort_values("saledate", ascending=True, inplace=True)

In [7]:
# Show the first five rows, transposed so each column of `df` becomes a row.
df.head(n=5).transpose()

Unnamed: 0,205615,274835,141296,212552,62755
SalesID,1646770,1821514,1505138,1671174,1329056
SalePrice,9500.0,14000.0,50000.0,16000.0,22000.0
MachineID,1126363,1194089,1473654,1327630,1336053
ModelID,8434,10150,4139,8591,4089
datasource,132,132,132,132,132
auctioneerID,18.0,99.0,99.0,99.0,99.0
YearMade,1974,1980,1978,1980,1984
MachineHoursCurrentMeter,,,,,
UsageBand,,,,,
saledate,1989-01-17 00:00:00,1989-01-31 00:00:00,1989-01-31 00:00:00,1989-01-31 00:00:00,1989-01-31 00:00:00


In [8]:
# Work on a copy so the data can be manipulated without losing the
# original `df`.
df_copy = df.copy(deep=True)

In [9]:
# Derive calendar features from the `saledate` column of `df_copy`.
sale_dt = df_copy['saledate'].dt  # datetime accessor, reused for every feature below
df_copy['saleYear'] = sale_dt.year
df_copy['saleMonth'] = sale_dt.month
df_copy['saleDay'] = sale_dt.day
df_copy['saleDayOfWeek'] = sale_dt.dayofweek
df_copy['saleDayOfYear'] = sale_dt.dayofyear

In [10]:
# Show the first three rows of `df`, transposed.
# NOTE(review): this displays `df`, not `df_copy`, so the sale* columns added
# to `df_copy` in the previous cell will not appear here — confirm which frame
# was meant to be inspected.
df.head(3).T

Unnamed: 0,205615,274835,141296
SalesID,1646770,1821514,1505138
SalePrice,9500.0,14000.0,50000.0
MachineID,1126363,1194089,1473654
ModelID,8434,10150,4139
datasource,132,132,132
auctioneerID,18.0,99.0,99.0
YearMade,1974,1980,1978
MachineHoursCurrentMeter,,,
UsageBand,,,
saledate,1989-01-17 00:00:00,1989-01-31 00:00:00,1989-01-31 00:00:00


In [11]:
# Drop the raw `saledate` column from `df_copy` (in place).
# Not re-runnable: a second execution raises KeyError because the column is gone.
df_copy.drop(columns=['saledate'], inplace=True)

In [12]:
# Print a summary of `df_copy` (columns, non-null counts, dtypes).
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 412698 entries, 205615 to 409203
Data columns (total 57 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   SalesID                   412698 non-null  int64  
 1   SalePrice                 412698 non-null  float64
 2   MachineID                 412698 non-null  int64  
 3   ModelID                   412698 non-null  int64  
 4   datasource                412698 non-null  int64  
 5   auctioneerID              392562 non-null  float64
 6   YearMade                  412698 non-null  int64  
 7   MachineHoursCurrentMeter  147504 non-null  float64
 8   UsageBand                 73670 non-null   object 
 9   fiModelDesc               412698 non-null  object 
 10  fiBaseModel               412698 non-null  object 
 11  fiSecondaryDesc           271971 non-null  object 
 12  fiModelSeries             58667 non-null   object 
 13  fiModelDescriptor         74816 non-nul

In [13]:
# Independent scratch copy of `df_copy`; changes to `cp` do not affect `df_copy`.
cp = df_copy.copy(deep=True)

In [14]:
# Convert every text-like column of `cp` to the pandas `category` dtype.
#
# Object dtype is tested explicitly in addition to is_string_dtype(): from
# pandas 2.0 onward is_string_dtype() inspects the values of object columns and
# reports False when they contain missing values, which would silently leave
# those columns unconverted. On older pandas the combined check selects the
# same columns as is_string_dtype() alone.
text_columns = [
    column_name
    for column_name, column_data in cp.items()
    if pd.api.types.is_object_dtype(column_data)
    or pd.api.types.is_string_dtype(column_data)
]

# The column list is built first so the frame is not modified while it is
# being iterated over.
for column_name in text_columns:
    cp[column_name] = cp[column_name].astype('category')


In [15]:
# Integer category codes assigned to each row's `state` value.
cp['state'].cat.codes


205615    43
274835     8
141296     8
212552     8
62755      8
          ..
410879     4
412476     4
411927     4
407124     4
409203     4
Length: 412698, dtype: int8

In [16]:
# Convert every text-like column of `df_copy` to an ordered `category` dtype.
#
# Object dtype is tested explicitly in addition to is_string_dtype(): from
# pandas 2.0 onward is_string_dtype() inspects the values of object columns and
# reports False when they contain missing values, which would silently leave
# those columns unconverted. On older pandas the combined check selects the
# same columns as is_string_dtype() alone.
text_columns = [
    column_name
    for column_name, column_data in df_copy.items()
    if pd.api.types.is_object_dtype(column_data)
    or pd.api.types.is_string_dtype(column_data)
]

# The column list is built first so the frame is not modified while it is
# being iterated over.
for column_name in text_columns:
    df_copy[column_name] = df_copy[column_name].astype('category').cat.as_ordered()

In [17]:
# 

205615    43
274835     8
141296     8
212552     8
62755      8
          ..
410879     4
412476     4
411927     4
407124     4
409203     4
Length: 412698, dtype: int8

0.0