In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call while downloading.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'top-mobile-phones-in-india-2023-on-flipkart:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3452283%2F6035224%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240314%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240314T105137Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9a2774ee0387e46a835a3ca9dafe3dea13baf4f34f2fa7360255bc4a50c7b546e57ad53e74475f8e71e7b0f4ab72dcc00e04c91a97a1a42a0b71bbedc9d141be1a1caca844474bc43a77a2c7808533799780d875afe26f92eeb27784890dac1e88ccba5d9d338fff288c5f4e76f94c84490cd08e2f51e6d2950c54c5540a95df3c7a3d62320a29ada57ec124993b35f2f9d34ef8432dcad7c32d52973bb9c9d6771dcc5caac31b59dc1bdb7aa86ce46bb9e5e96a910816638f35391ea8749e3bbf8cf2fc91616fc356e40ae20b19d86dbc591f5357d35a9296893f70f51e2f7bf543d73db1ca6995cf36b7cd1166556d16c82af9e4934067b03b8193ecb36484'

# Directory under which every downloaded data source is extracted.
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory that is created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download each "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# and extract the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' left in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional, so the header lookup may return None;
            # treat that as "size unknown" instead of crashing in int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            if total_length:
                print(f'Downloading {directory}, {total_length} bytes compressed')
            else:
                print(f'Downloading {directory}, unknown size')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # Without a known total the progress bar simply stays empty.
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to the file and rewind, so the archive readers
            # see the complete download from its first byte.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a dedicated name for the archive: binding it to `tarfile`
                # would overwrite the imported module for later iterations.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # The signed URL carries an expiry parameter, hence the hint in the message.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


The dataset used is a list of popular smartphones sold in India in 2023, taken from a well-known e-commerce platform (Flipkart).

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.impute import SimpleImputer

The libraries necessary for the analysis are imported above. Next, the dataset is loaded.

In [None]:
# Load the Flipkart listing file from the Kaggle input directory.
csv_path = r"/kaggle/input/top-mobile-phones-in-india-2023-on-flipkart/flipkart_top_mobiles.csv"
df = pd.read_csv(csv_path)

The first five records are shown below.

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

The `title` column is split into four new columns: model, color, internal storage, and RAM.

In [None]:
# Everything before the first "(" is taken as the model name. Strip the
# whitespace that precedes the bracket so equal models compare equal.
# expand=False returns a Series, which is what a single column assignment needs.
df["model"] = df["title"].str.extract(r'^(.*?)\(', expand=False).str.strip()

In [None]:
# Inspect the extracted model names.
df["model"]

In [None]:
# The color is the text between the first "(" and the following "," .
# Also stop at ")" so a bracket without a comma does not drag the rest of the
# title into the color, and strip surrounding whitespace.
df["color"] = df["title"].str.extract(r'\(([^,)]*)', expand=False).str.strip()

In [None]:
# Inspect the extracted colors.
df["color"]

In [None]:
# Internal storage: the text after the first comma (and any blanks) up to the
# next closing bracket.
storage_pattern = r',\s*([^)]*)'
df["internal storage"] = df["title"].str.extract(storage_pattern)

In [None]:
# Inspect the extracted internal-storage text.
df["internal storage"]

In [None]:
# RAM: the content of the second bracketed group in the title.
ram_pattern = r'\([^)]*\)\s*\(([^)]*)\)'
df["RAM"] = df["title"].str.extract(ram_pattern)

In [None]:
# Inspect the extracted RAM text.
df["RAM"]

Finally, the original `title` column is dropped.

In [None]:
# The title has been split into separate columns, so the raw text is removed.
df = df.drop(columns=["title"])

In [None]:
# Display the frame after the title column was split and dropped.
df

Looking at the metadata, there are 216 records with 10 columns. The columns and their data types are listed below.

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Column labels currently present in the frame.
df.columns

In [None]:
# Data type of every column before any type conversion.
df.dtypes

The columns price, rating_count, and discount need to be converted to a numeric data type, and seller_rating to object.

In [None]:
# Remove the rupee sign from the price text.
price_text = df["price"]
df["price"] = price_text.str.replace('₹', '')

In [None]:
# Remove the thousands separators from the price text.
price_text = df["price"]
df["price"] = price_text.str.replace(',', '')

In [None]:
# With the symbol and separators removed, the price text is cast to integers.
df = df.astype({"price": int})

In [None]:
# Inspect the converted price column.
df["price"]

In [None]:
# Remove the thousands separators from the rating-count text.
rating_count_text = df["rating_count"]
df["rating_count"] = rating_count_text.str.replace(',', '')

In [None]:
# Cast the cleaned rating-count text to integers.
df = df.astype({"rating_count": int})

In [None]:
# Inspect the converted rating_count column.
df["rating_count"]

In [None]:
# Drop the percent sign, then parse what is left as a number; text that still
# is not numeric becomes NaN because of errors='coerce'.
discount_text = df["discount"].str.replace('%','')
df['discount'] = pd.to_numeric(discount_text, errors='coerce')

In [None]:
# Cast only the ratings that are present. A plain astype(str) turns missing
# values into the literal string 'nan', which hides them from isnull() and
# makes the later imputation step a no-op.
seller_rating = df["seller_rating"]
df["seller_rating"] = seller_rating.astype(str).where(seller_rating.notna())

The updated columns and data types are shown below.

In [None]:
# Data type of every column after the conversions above.
df.dtypes

Next, the business moments (measures of central tendency, dispersion, skewness, and kurtosis) are calculated:

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Median of every numeric column in a single call. Written as four separate
# expressions, only the last one is echoed by the notebook and the other three
# results are silently discarded.
df[["price","prod_rating","rating_count","discount"]].median()

In [None]:
# Most frequent value(s) of every column.
df.mode()

In [None]:
# Variance of each numeric column.
moment_columns = ["price","prod_rating","rating_count","discount"]
df[moment_columns].var()

In [None]:
# Skewness of each numeric column.
moment_columns = ["price","prod_rating","rating_count","discount"]
df[moment_columns].skew()

In [None]:
# Kurtosis of each numeric column.
moment_columns = ["price","prod_rating","rating_count","discount"]
df[moment_columns].kurt()

There are 16 duplicate rows in the data; they are dropped in the cell below.

In [None]:
# Report the number of fully duplicated rows before removing them. As a bare
# expression in the middle of the cell the count was computed but never shown.
duplicate_count = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_count}")
df = df.drop_duplicates()

In [None]:
# (rows, columns) after the duplicate rows were removed.
df.shape

The dataset contains missing values, as shown below. An imputation technique is needed to treat them.

In [None]:
# Number of missing values per column before imputation.
df.isnull().sum()

In [None]:
# Display the frame after the duplicate rows were removed.
df

In [None]:
# Most frequent seller rating; the next cell imputes missing ratings.
df["seller_rating"].mode()

In [None]:
# Impute missing seller ratings with the most frequent rating.
# - The column holds text, so the fill value is taken from the column itself
#   instead of a bare float, which would mix floats into a string column.
# - Literal 'nan' strings (left behind when a column with gaps is cast with
#   astype(str)) are treated as missing too.
# - The result is assigned back: fillna(inplace=True) on df["..."] modifies an
#   intermediate object and does not update df under pandas Copy-on-Write.
seller_rating = df["seller_rating"].mask(df["seller_rating"] == "nan")
seller_rating_mode = seller_rating.mode()
if not seller_rating_mode.empty:
    seller_rating = seller_rating.fillna(seller_rating_mode.iloc[0])
df["seller_rating"] = seller_rating

In [None]:
# Re-count the missing values per column after the seller_rating step.
df.isnull().sum()

In [None]:
# Most frequent discount; the next cell imputes missing discounts.
df["discount"].mode()

In [None]:
# Fill missing discounts with 25 and assign the result back to the frame.
# fillna(inplace=True) on df["..."] is chained assignment: it acts on an
# intermediate object and leaves df unchanged under pandas Copy-on-Write.
df["discount"] = df["discount"].fillna(25)

In [None]:
# Most frequent internal-storage value; the next cell imputes missing entries.
df["internal storage"].mode()

In [None]:
# Impute missing storage values with the column's most frequent entry.
# The column holds the text captured from the title, so a bare integer fill
# value would mix numbers into a text column; the mode keeps the same format.
# The result is assigned back because fillna(inplace=True) on df["..."] does
# not update df under pandas Copy-on-Write.
storage_mode = df["internal storage"].mode()
if not storage_mode.empty:
    df["internal storage"] = df["internal storage"].fillna(storage_mode.iloc[0])

In [None]:
# Most frequent RAM value; the next cell imputes missing entries.
df["RAM"].mode()

In [None]:
# Impute missing RAM values with the column's most frequent entry.
# The column holds the text captured from the title, so a bare integer fill
# value would mix numbers into a text column; the mode keeps the same format.
# The result is assigned back because fillna(inplace=True) on df["..."] does
# not update df under pandas Copy-on-Write.
ram_mode = df["RAM"].mode()
if not ram_mode.empty:
    df["RAM"] = df["RAM"].fillna(ram_mode.iloc[0])

In [None]:
# Final count of missing values per column after all imputation steps.
df.isnull().sum()

In order to identify the outliers in each variable, a box plot is drawn for each one.

In [None]:
# Box plot of the price values before outlier treatment.
price_values = df["price"]
sns.boxplot(data=price_values)

In [None]:
# Box plot of the product ratings before outlier treatment.
prod_rating_values = df["prod_rating"]
sns.boxplot(data=prod_rating_values)

In [None]:
# Box plot of the rating counts before outlier treatment.
rating_count_values = df["rating_count"]
sns.boxplot(data=rating_count_values)

In [None]:
# Box plot of the discounts before outlier treatment.
discount_values = df["discount"]
sns.boxplot(data=discount_values)

The winsorization technique is used to treat the outliers.

In [None]:
from scipy.stats.mstats import winsorize
# Winsorize the price column with a 5% limit on both the lower and upper end.
price_winsorized = winsorize(df["price"], limits=[0.05,0.05])
df["price"] = price_winsorized

In [None]:
# Box plot of the winsorized prices. The column is passed as `data=`, exactly
# like the box plots drawn before winsorizing, so both figures are built the
# same way; as a positional argument, seaborn releases that accept only
# keyword arguments bind it differently, which can change the orientation.
sns.boxplot(data=df["price"])

In [None]:
# Winsorize the product ratings with a 5% limit on both ends.
prod_rating_winsorized = winsorize(df["prod_rating"], limits=[0.05,0.05])
df["prod_rating"] = prod_rating_winsorized

In [None]:
# Box plot of the winsorized product ratings. Passed as `data=` to match the
# box plots drawn before winsorizing, so the two figures are directly comparable
# regardless of how the installed seaborn treats a positional argument.
sns.boxplot(data=df["prod_rating"])

In [None]:
# Winsorize the rating counts: 5% limit on the lower end, 10% on the upper end.
rating_count_winsorized = winsorize(df["rating_count"], limits=[0.05,0.10])
df["rating_count"] = rating_count_winsorized

In [None]:
# Box plot of the winsorized rating counts. Passed as `data=` to match the box
# plots drawn before winsorizing, so the two figures are directly comparable
# regardless of how the installed seaborn treats a positional argument.
sns.boxplot(data=df["rating_count"])

In [None]:
# Winsorize the discounts with a 5% limit on both ends, then redraw the box
# plot. The column is passed as `data=` to match the box plots drawn before
# winsorizing, regardless of how the installed seaborn treats a positional
# argument.
df["discount"]=winsorize(df["discount"],limits=[0.05,0.05])
sns.boxplot(data=df["discount"])

Histograms are plotted for each numerical column to get an idea of the distribution of the data.

In [None]:
# Distribution of the winsorized prices.
price_values = df["price"]
sns.histplot(price_values)

In [None]:
# Distribution of the winsorized product ratings.
prod_rating_values = df["prod_rating"]
sns.histplot(prod_rating_values)

In [None]:
# Distribution of the winsorized rating counts.
rating_count_values = df["rating_count"]
sns.histplot(rating_count_values)

In [None]:
# Distribution of the winsorized discounts.
discount_values = df["discount"]
sns.histplot(discount_values)

In [None]:
# Distribution of the seller ratings.
# NOTE(review): this column was cast with astype(str) earlier in the notebook,
# so seaborn receives text here rather than numbers — confirm that a
# categorical axis is what is wanted.
sns.histplot(df["seller_rating"])

The correlations between the numerical variables are shown below.

In [None]:
# Pairwise correlation matrix of the numeric columns, echoed as the cell output.
numeric_columns = ["price","prod_rating","rating_count","discount"]
correlation = df[numeric_columns].corr()
correlation

In [None]:
# Draw the correlation matrix as a heat map.
sns.heatmap(data=correlation)

In [None]:
# Write the processed frame next to the notebook (the current working
# directory). The previous absolute Windows path only exists on one machine;
# on the Linux hosts this notebook is set up for (it creates /kaggle/...), it
# would produce a file whose name contains the literal backslashes.
df.to_csv("flipkart_top_mobiles_processed1.csv")