# Statistical analysis of the data and removal of outliers

In [11]:
import numpy as np
import pandas as pd

### Loading the data

In [12]:
# Read the raw dataset (a CSV in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [13]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   brand       217 non-null    object
 1   processor   217 non-null    object
 2   ram         217 non-null    int64 
 3   video-card  217 non-null    object
 4   memory      217 non-null    int64 
 5   price       217 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 10.3+ KB
None


In [14]:
# Cast every column in a single pass: the text columns become categoricals,
# the numeric columns are (re)declared as integers.
df = df.astype(
    {
        "brand": "category",
        "processor": "category",
        "video-card": "category",
        "ram": "int",
        "memory": "int",
        "price": "int",
    }
)

In [15]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   brand       217 non-null    category
 1   processor   217 non-null    category
 2   ram         217 non-null    int64   
 3   video-card  217 non-null    category
 4   memory      217 non-null    int64   
 5   price       217 non-null    int64   
dtypes: category(3), int64(3)
memory usage: 7.1 KB
None


### Computing the IQR-based lower and upper outlier bounds of the numeric columns

In [16]:
# For each numeric column, derive the outlier fences from the interquartile
# range: anything below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR is an outlier.
# The bounds are kept in two lists that follow the order of `columns`.
columns = df.select_dtypes(include=[np.number]).columns
min_values, max_values = [], []
for column in columns:
    Q1, Q3 = (df[column].quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    min_value, max_value = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    min_values.append(min_value)
    max_values.append(max_value)
    print(f"Column: {column}, min: {min_value}, max: {max_value}")

Column: ram, min: -8.0, max: 56.0
Column: memory, min: -256.0, max: 1792.0
Column: price, min: -8251.0, max: 73749.0


### Removing the outliers

In [17]:
# Keep only the rows whose value lies inside the [lower, upper] bounds
# (both ends inclusive) for every numeric column, one column at a time.
for column, lower, upper in zip(columns, min_values, max_values):
    df = df[df[column].between(lower, upper)]

In [18]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 174 entries, 0 to 215
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   brand       174 non-null    category
 1   processor   174 non-null    category
 2   ram         174 non-null    int64   
 3   video-card  174 non-null    category
 4   memory      174 non-null    int64   
 5   price       174 non-null    int64   
dtypes: category(3), int64(3)
memory usage: 7.2 KB
None


In [19]:
# Leave the summary statistics as the cell's last expression so Jupyter
# renders them with its rich table display instead of plain printed text.
df.describe()

              ram       memory         price
count  174.000000   174.000000    174.000000
mean    20.367816   697.379310  32517.850575
std     10.152724   260.225835  14111.228172
min      8.000000   256.000000  13599.000000
25%     16.000000   512.000000  20019.000000
50%     16.000000   512.000000  29124.000000
75%     24.000000  1024.000000  39984.000000
max     48.000000  1024.000000  71739.000000


In [20]:
# Persist the filtered rows; the row index is not written to the file.
df.to_csv(path_or_buf='data_cleaned.csv', index=False)