# Descriptive Statistics for Size, Type and Price

# Loading Packages

In [176]:
import pandas as pd
import numpy as np
#from pandas_profiling import ProfileReport
#import seaborn as sns
import matplotlib.pyplot as plt

# for Q-Q plots
import scipy.stats as stats
# from feature-engine
# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

import plotly.express as px
#import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.figure_factory import create_scatterplotmatrix
from plotly.subplots import make_subplots

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Loading Data

In [177]:
X_train = pd.read_csv("Xtrain_mod_a.csv")
X_test = pd.read_csv("Xtest_mod_a.csv")
ytrain = pd.read_csv("ytrain.csv")
ytest = pd.read_csv("ytest.csv")
print("Shape of X Train: {}".format(X_train.shape))
print("Shape of X Test: {}".format(X_test.shape))
print("Shape of y Train: {}".format(ytrain.shape))
print("Shape of y Test: {}".format(ytest.shape))

Shape of X Train: (8672, 9)
Shape of X Test: (2169, 9)
Shape of y Train: (8672, 1)
Shape of y Test: (2169, 1)


# Size Variable

15% der Beobachtungen fallen unter die Kategorie "Varies with Device", die anderen Beobachtungen verteilen sich auf weitere 412 Kategorien/Zahlen und werden hier in 10 Kategorien gewandelt

Kilobyte in Megabyte umwandeln

In [178]:
for row in X_train.index:
    #Kilobyte in Megabyte umwandeln
    if 'k' in X_train.loc[row, 'Size']:
        X_train.loc[row, 'Size'] = X_train.loc[row, 'Size'].replace('k', '')
        X_train.loc[row, 'Size'] = float(X_train.loc[row, 'Size'])
        X_train.loc[row, 'Size'] = X_train.loc[row, 'Size'] / 1000

    # M entfernen
    elif 'M' in X_train.loc[row, 'Size']:
        X_train.loc[row, 'Size'] = X_train.loc[row, 'Size'].replace('M', '')
        X_train.loc[row, 'Size'] = float(X_train.loc[row, 'Size'])


Kontrolle, ob die Umwandlung funktioniert hat

In [179]:
print(X_train['Size'])

0       Varies with device
1                      1.3
2                    0.775
3                     42.0
4                     16.0
               ...        
8667                  88.0
8668    Varies with device
8669    Varies with device
8670                   6.8
8671    Varies with device
Name: Size, Length: 8672, dtype: object


In [180]:
len(X_train['Size'].unique())

412

In [181]:
print(X_train['Size'].describe())

count                   8672
unique                   412
top       Varies with device
freq                    1368
Name: Size, dtype: object


Nur numerische Werte von Size anschauen

In [182]:
only_numeric_values = X_train[X_train['Size'] != 'Varies with device']
print(only_numeric_values['Size'].describe())

count     7304.0
unique     411.0
top         14.0
freq       168.0
Name: Size, dtype: float64


In [183]:
data = [
    go.Histogram( x = only_numeric_values['Size'])
    ]

fig = go.Figure(data)
fig.update_xaxes(title_text='Numerische Werte von Size')
fig.update_yaxes(title_text='Totale Anzahl')
fig.show()

In [184]:
data = [
    go.Box( x = sorted(only_numeric_values['Size']))
    ]

fig = go.Figure(data)
fig.update_xaxes(title_text='Verteilung numerische Werte von Size')
fig.show()

10 gleich grosse Kategorien machen

In [185]:
only_numeric_values['SizeRange'] = pd.cut(only_numeric_values['Size'], bins=10)
print(only_numeric_values['SizeRange'])

1       (-0.0915, 10.008]
2       (-0.0915, 10.008]
3        (40.005, 50.004]
4        (10.008, 20.007]
5         (90.001, 100.0]
              ...        
8664    (-0.0915, 10.008]
8665    (-0.0915, 10.008]
8666    (-0.0915, 10.008]
8667     (80.002, 90.001]
8670    (-0.0915, 10.008]
Name: SizeRange, Length: 7304, dtype: category
Categories (10, interval[float64, right]): [(-0.0915, 10.008] < (10.008, 20.007] < (20.007, 30.006] < (30.006, 40.005] ... (60.003, 70.003] < (70.003, 80.002] < (80.002, 90.001] < (90.001, 100.0]]


Numerische Werte in 10 Kategorien einteilen

In [186]:
for row in X_train.index:

    if X_train.loc[row, 'Size'] == 'Varies with device':
        #nothing
        X_train.loc[row, 'Size'] = 'Varies with device'
    elif X_train.loc[row, 'Size'] <= 10:
        X_train.loc[row, 'Size'] = '0.1-10MB'
    elif 10.0 < X_train.loc[row, 'Size'] <= 20.0:
        X_train.loc[row, 'Size'] = '10.1-20MB'
    elif 20.0 < X_train.loc[row, 'Size'] <= 30.0:
        X_train.loc[row, 'Size'] = '20.1-30MB'
    elif 30.0 < X_train.loc[row, 'Size'] <= 40.0:
        X_train.loc[row, 'Size'] = '30.1-40MB'
    elif 40.0 < X_train.loc[row, 'Size'] <= 50.0:
        X_train.loc[row, 'Size'] = '40.1-50MB'
    elif 50.0 < X_train.loc[row, 'Size'] <= 60.0:
        X_train.loc[row, 'Size'] = '50.1-60MB'
    elif 60.0 < X_train.loc[row, 'Size'] <= 70.0:
        X_train.loc[row, 'Size'] = '60.1-70MB'
    elif 70.0 < X_train.loc[row, 'Size'] <= 80.0:
        X_train.loc[row, 'Size'] = '70.1-80MB'
    elif 80.0 < X_train.loc[row, 'Size'] <= 90.0:
        X_train.loc[row, 'Size'] = '80.1-90MB'
    elif 90 < X_train.loc[row, 'Size'] <= 100:
        X_train.loc[row, 'Size'] = '90.1-100MB'


In [187]:
print(X_train['Size'].describe())

count         8672
unique          11
top       0.1-10MB
freq          3253
Name: Size, dtype: object


In [188]:
data = [
    go.Histogram( x = X_train['Size'])
    ]

fig = go.Figure(data)
fig.update_xaxes(title_text='Size')
fig.update_yaxes(title_text='Totale Anzahl')
fig.show()

In [189]:
data = [
    go.Histogram( x = X_train['Size'], histnorm='probability')
    ]

fig = go.Figure(data)
fig.update_xaxes(title_text='Size')
fig.update_yaxes(title_text='Anteil in Prozent')
fig.show()

# Type Variable

In [190]:
data = [
    go.Histogram( x = X_train['Type'])
    ]

fig = go.Figure(data)
fig.update_xaxes(title_text='Type')
fig.update_yaxes(title_text='Totale Anzahl')
fig.show()

In [191]:
data = [
    go.Histogram( x = X_train['Type'],
                  histnorm = 'probability')
    ]

fig = go.Figure(data)
fig.update_xaxes(title_text='Types')
fig.update_yaxes(title_text='Anteil in Prozent')
fig.show()

In [192]:
print(X_train['Type'].describe())

count     8672
unique       2
top       Free
freq      8020
Name: Type, dtype: object


# Price Variable

Der Preis 0 Dollar macht 92,48% aus

Dollarzeichen entfernen:

In [193]:
for row in X_train.index:
    #Dollarzeichen entfernen
    if '$' in X_train.loc[row, 'Price']:
        X_train.loc[row, 'Price'] = X_train.loc[row, 'Price'].replace('$', '')


In [194]:
data = [
    go.Histogram( x = X_train['Price'])
    ]

fig = go.Figure(data)
fig.update_xaxes(title_text='Price')
fig.update_yaxes(title_text='Totale Anzahl')
fig.show()

In [195]:
data = [
    go.Histogram( x = X_train['Price'],
                  histnorm = 'probability')
    ]

fig = go.Figure(data)
fig.update_xaxes(title_text='Prices')
fig.update_yaxes(title_text='Anteil in Prozent')
fig.show()

In [196]:
print(X_train['Price'].describe())

count     8672
unique      79
top          0
freq      8020
Name: Price, dtype: object


# Save

In [197]:
X_train.to_csv("Xtrain_mod_b.csv",index=False)
X_test.to_csv("Xtest_mod_b.csv",index=False)