# Data Cleaning - Shark Tank India

In [49]:
import pandas as pd

* Loading CSV file

In [50]:
# Read the raw pitch-level CSV into a DataFrame.
# NOTE: the file name (including the 'Shartank' spelling) is kept exactly as written.
raw_pitches_csv = "ShartankIndiaAllPitches.csv"
df = pd.read_csv(raw_pitches_csv)

In [51]:
# Preview the first five rows of the freshly loaded data
df.head(n=5)

Unnamed: 0,Episode Number,Pitch Number,Brand,Idea,Investment Amount (In Lakhs INR),Debt (In lakhs INR),Equity,Anupam,Ashneer,Namita,Aman,Peyush,Vineeta,Ghazal,Season
0,1,1,BluePine Industries,Frozen Momos,75,0,18%,N,Y,N,Y,N,Y,N,1
1,1,2,Booz scooters,Renting e-bike for mobility in private spaces,40,0,50%,N,Y,N,N,N,Y,N,1
2,1,3,Heart up my Sleeves,Detachable Sleeves,25,0,30%,Y,N,N,N,N,Y,N,1
3,2,4,Tagz Foods,Healthy Potato Chips,70,0,2.75%,N,Y,N,N,N,N,N,1
4,2,5,Head and Heart,Brain Development Course,0,0,0,N,N,N,N,N,N,N,1


* Renaming columns 

In [52]:
# List the raw column labels; these exact strings are the keys of the rename mapping below
df.columns

Index(['Episode Number', 'Pitch Number', 'Brand', 'Idea',
       'Investment Amount (In Lakhs INR) ', 'Debt (In lakhs INR)', 'Equity',
       'Anupam', 'Ashneer', 'Namita', 'Aman', 'Peyush', 'Vineeta', 'Ghazal',
       'Season'],
      dtype='object')

In [53]:
# Map the raw CSV headers to snake_case names used in the rest of the notebook.
# Headers are whitespace-stripped first because the raw investment header carries
# a trailing space ('Investment Amount (In Lakhs INR) '), which makes an
# exact-match rename fragile.
column_renames = {
    'Episode Number': 'episode',
    'Pitch Number': 'pitch_number',
    'Brand': 'startup_name',
    'Idea': 'business_idea',
    'Investment Amount (In Lakhs INR)': 'investment_amount_lakhs',
    'Debt (In lakhs INR)': 'debt_amount_lakhs',
    'Equity': 'equity',
    'Anupam': 'anupam',
    'Ashneer': 'ashneer',
    'Namita': 'namita',
    'Aman': 'aman',
    'Peyush': 'peyush',
    'Vineeta': 'vineeta',
    'Ghazal': 'ghazal',
    'Season': 'season'
}
df.columns = df.columns.str.strip()
df = df.rename(columns=column_renames)

# rename() silently skips keys it cannot find, so verify the outcome explicitly.
# The check is on the target names, so re-running this cell remains a no-op.
missing_columns = sorted(set(column_renames.values()) - set(df.columns))
assert not missing_columns, f"columns missing after rename: {missing_columns}"

In [54]:
# Display the column labels again to confirm the rename took effect
df.columns

Index(['episode', 'pitch_number', 'startup_name', 'business_idea',
       'investment_amount_lakhs', 'debt_amount_lakhs', 'equity', 'anupam',
       'ashneer', 'namita', 'aman', 'peyush', 'vineeta', 'ghazal', 'season'],
      dtype='object')

* Cleaning Numeric Columns

In [55]:
# Coerce both amount columns to numbers: entries that cannot be parsed become
# NaN (errors='coerce') and are then stored as 0.
for amount_col in ('investment_amount_lakhs', 'debt_amount_lakhs'):
    parsed_amounts = pd.to_numeric(df[amount_col], errors='coerce')
    df[amount_col] = parsed_amounts.fillna(0)

In [56]:
# Inspect dtypes and non-null counts after the numeric conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   episode                  117 non-null    int64 
 1   pitch_number             117 non-null    int64 
 2   startup_name             117 non-null    object
 3   business_idea            117 non-null    object
 4   investment_amount_lakhs  117 non-null    int64 
 5   debt_amount_lakhs        117 non-null    int64 
 6   equity                   117 non-null    object
 7   anupam                   117 non-null    object
 8   ashneer                  117 non-null    object
 9   namita                   117 non-null    object
 10  aman                     117 non-null    object
 11  peyush                   117 non-null    object
 12  vineeta                  117 non-null    object
 13  ghazal                   117 non-null    object
 14  season                   117 non-null    int64 

* Cleaning the Equity Column (strip the `%` sign and convert to numeric)

In [57]:
# Convert equity from percent strings (e.g. '18%', '2.75%') to floats.
# Casting to str first keeps the .str accessor valid when the column is already
# numeric (e.g. when this cell is re-run) or holds mixed types; without the cast
# the accessor raises on a float column and yields NaN for non-string entries.
equity_text = df['equity'].astype(str).str.replace('%', '', regex=False).str.strip()
# Anything that still cannot be parsed becomes NaN and is stored as 0.
df['equity'] = pd.to_numeric(equity_text, errors='coerce').fillna(0)

In [58]:
# Preview the first five rows to check the cleaned equity values
df.head(n=5)

Unnamed: 0,episode,pitch_number,startup_name,business_idea,investment_amount_lakhs,debt_amount_lakhs,equity,anupam,ashneer,namita,aman,peyush,vineeta,ghazal,season
0,1,1,BluePine Industries,Frozen Momos,75,0,18.0,N,Y,N,Y,N,Y,N,1
1,1,2,Booz scooters,Renting e-bike for mobility in private spaces,40,0,50.0,N,Y,N,N,N,Y,N,1
2,1,3,Heart up my Sleeves,Detachable Sleeves,25,0,30.0,Y,N,N,N,N,Y,N,1
3,2,4,Tagz Foods,Healthy Potato Chips,70,0,2.75,N,Y,N,N,N,N,N,1
4,2,5,Head and Heart,Brain Development Course,0,0,0.0,N,N,N,N,N,N,N,1


In [59]:
# Re-inspect dtypes after the equity column was converted with pd.to_numeric
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   episode                  117 non-null    int64  
 1   pitch_number             117 non-null    int64  
 2   startup_name             117 non-null    object 
 3   business_idea            117 non-null    object 
 4   investment_amount_lakhs  117 non-null    int64  
 5   debt_amount_lakhs        117 non-null    int64  
 6   equity                   117 non-null    float64
 7   anupam                   117 non-null    object 
 8   ashneer                  117 non-null    object 
 9   namita                   117 non-null    object 
 10  aman                     117 non-null    object 
 11  peyush                   117 non-null    object 
 12  vineeta                  117 non-null    object 
 13  ghazal                   117 non-null    object 
 14  season                   117 non-null    int64  

* Converting Shark Votes to Binary

In [60]:
# Encode each shark's vote as an integer flag: 'Y' -> 1, 'N' -> 0.
shark_columns = ['anupam', 'ashneer', 'namita', 'aman', 'peyush', 'vineeta', 'ghazal']

# '1'/'0' are accepted as well so that re-running this cell on already-encoded
# columns leaves them unchanged instead of failing on the .str accessor.
vote_to_flag = {'Y': 1, 'N': 0, '1': 1, '0': 0}

for shark in shark_columns:
    # Cast to str and trim so values such as ' y' or 'Y ' are still recognised.
    votes = df[shark].astype(str).str.strip().str.upper()
    # Any value outside the mapping (including missing entries) is stored as 0.
    df[shark] = votes.map(vote_to_flag).fillna(0).astype(int)

In [61]:
# Preview the first five rows to check the encoded shark vote columns
df.head(n=5)

Unnamed: 0,episode,pitch_number,startup_name,business_idea,investment_amount_lakhs,debt_amount_lakhs,equity,anupam,ashneer,namita,aman,peyush,vineeta,ghazal,season
0,1,1,BluePine Industries,Frozen Momos,75,0,18.0,0,1,0,1,0,1,0,1
1,1,2,Booz scooters,Renting e-bike for mobility in private spaces,40,0,50.0,0,1,0,0,0,1,0,1
2,1,3,Heart up my Sleeves,Detachable Sleeves,25,0,30.0,1,0,0,0,0,1,0,1
3,2,4,Tagz Foods,Healthy Potato Chips,70,0,2.75,0,1,0,0,0,0,0,1
4,2,5,Head and Heart,Brain Development Course,0,0,0.0,0,0,0,0,0,0,0,1


* Saving Cleaned Data 

In [62]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file
cleaned_csv = "Cleaned_SharkTankIndia.csv"
df.to_csv(cleaned_csv, index=False)