In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Load the train and test splits from CSV files in the current working directory.
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [5]:
# Show the column labels of the training frame.
train.columns

Index(['id', 'class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')

In [6]:
# Show the column labels of the test frame (same as train, minus the 'class' column).
test.columns

Index(['id', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')

In [8]:
# Print the (rows, columns) shape of each frame: test first, then train.
print(test.shape,"  ",train.shape)

(2077964, 21)    (3116945, 22)


In [9]:
# Print the dtype of every column in train.
# NOTE: the categorical features show up as `category` because the cleaning
# cell (In [3]) was executed before this one and converts those columns of
# `train` in place.
print(train.dtypes)

id                         int64
class                     object
cap-diameter             float64
cap-shape               category
cap-surface             category
cap-color               category
does-bruise-or-bleed    category
gill-attachment         category
gill-spacing            category
gill-color              category
stem-height              float64
stem-width               float64
stem-root               category
stem-surface            category
stem-color              category
veil-type               category
veil-color              category
has-ring                category
ring-type               category
spore-print-color       category
habitat                 category
season                  category
dtype: object


In [10]:
# Print a concise summary of train: index range, column names and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype   
---  ------                -----   
 0   id                    int64   
 1   class                 object  
 2   cap-diameter          float64 
 3   cap-shape             category
 4   cap-surface           category
 5   cap-color             category
 6   does-bruise-or-bleed  category
 7   gill-attachment       category
 8   gill-spacing          category
 9   gill-color            category
 10  stem-height           float64 
 11  stem-width            float64 
 12  stem-root             category
 13  stem-surface          category
 14  stem-color            category
 15  veil-type             category
 16  veil-color            category
 17  has-ring              category
 18  ring-type             category
 19  spore-print-color     category
 20  habitat               category
 21  season                category
dtypes: category(17), f

In [12]:
# Summary statistics for the numeric columns of train.
# The `count` row excludes missing values, so a count below the row total
# indicates NaNs in that column.
train.describe()

Unnamed: 0,id,cap-diameter,stem-height,stem-width
count,3116945.0,3116941.0,3116945.0,3116945.0
mean,1558472.0,6.309848,6.348333,11.15379
std,899784.7,4.657931,2.699755,8.095477
min,0.0,0.03,0.0,0.0
25%,779236.0,3.32,4.67,4.97
50%,1558472.0,5.75,5.88,9.65
75%,2337708.0,8.24,7.41,15.63
max,3116944.0,80.67,88.72,102.9


In [11]:
# Count missing values per column of train.
# NOTE: this cell ran after the cleaning cell (In [3]), which fills NaNs in the
# categorical columns with the label 'missing', so those columns report 0 here.
train.isnull().sum()

id                      0
class                   0
cap-diameter            4
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-root               0
stem-surface            0
stem-color              0
veil-type               0
veil-color              0
has-ring                0
ring-type               0
spore-print-color       0
habitat                 0
season                  0
dtype: int64

In [3]:
def cleaning(df, threshold=100):
    """Normalise the categorical feature columns of ``df``.

    For each categorical feature column:

    1. missing values are replaced by the label ``'missing'``;
    2. every label occurring fewer than ``threshold`` times in ``df`` is
       replaced by the label ``'noise'``;
    3. the column is converted to ``category`` dtype.

    The frame is modified **in place** and the same object is returned, so
    ``cleaning(train) is train``.

    The function is safe to call again on an already-cleaned frame (or on a
    frame whose columns are already ``category`` dtype): categorical columns
    are converted back to plain objects first, because pandas refuses to
    insert labels such as ``'missing'`` or ``'noise'`` that are not yet among a
    categorical column's categories.

    Note that label counts are computed from ``df`` alone, so two frames
    cleaned separately can end up with different category sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``cat_feats`` below.
    threshold : int, default 100
        Labels with a count strictly below this value are mapped to
        ``'noise'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the categorical columns cleaned.

    Raises
    ------
    KeyError
        If one of the expected categorical columns is absent from ``df``.
    """
    cat_feats = ["cap-shape", "cap-surface", "cap-color", "does-bruise-or-bleed", "gill-attachment",
                 "gill-spacing", "gill-color", "stem-root", "stem-surface", "stem-color", "veil-type",
                 "veil-color", "has-ring", "ring-type", "spore-print-color", "habitat", "season"]

    for feat in cat_feats:
        col = df[feat]

        # A categorical column only accepts labels already in its categories;
        # fall back to object dtype so 'missing' / 'noise' can be introduced.
        if isinstance(col.dtype, pd.CategoricalDtype):
            col = col.astype(object)

        # Fill missing values with 'missing'
        col = col.fillna('missing')

        # Replace categories with low counts with 'noise'
        counts = col.value_counts(dropna=False)
        low_count_cats = counts[counts < threshold].index
        if len(low_count_cats) > 0:
            col = col.mask(col.isin(low_count_cats), "noise")

        # Convert column to categorical type (written back in place)
        df[feat] = col.astype('category')

    return df
# Clean both splits. `cleaning` modifies its argument in place and returns it,
# so `tra` / `tes` are the same objects as `train` / `test`, not copies.
# Rare-label counts are computed separately for each frame.
tra = cleaning(train)
tes = cleaning(test)

In [4]:
# Preview the first rows of the cleaned training frame.
tra.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,missing,missing,w,missing,missing,f,f,missing,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,missing,y,o,missing,missing,t,z,missing,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,missing,s,n,missing,missing,f,f,missing,l,w
3,3,e,3.88,f,y,g,f,s,missing,g,...,missing,missing,w,missing,missing,f,f,missing,d,u
4,4,e,5.85,x,l,w,f,d,missing,w,...,missing,missing,w,missing,missing,f,f,missing,g,a
