In [14]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact
from ipywidgets import interact_manual

### Read data

In [15]:
data = pd.read_csv('./data/Startups_funding.csv')

data.columns = ['SNo', 'Date', 'StartupName', 'IndustryVertical', 'SubVertical', 'City', 'InvestorsName', 'InvestmentType', 'AmountInUSD', 'Remarks']

# Clean the strings
def clean_strings(x):
    return str(x).replace('\\xc2\\xa0', '').replace('\\\\xc2\\\\xa0', '')

# Apply the function to clean the data
for cl in ['StartupName', 'IndustryVertical', 'SubVertical', 'City', 'InvestorsName', 'InvestmentType', 'AmountInUSD', 'Remarks']:
    data[cl] = data[cl].apply(lambda x: clean_strings(x))
    
data.head()

Unnamed: 0,SNo,Date,StartupName,IndustryVertical,SubVertical,City,InvestorsName,InvestmentType,AmountInUSD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


In [16]:
data.columns

Index(['SNo', 'Date', 'StartupName', 'IndustryVertical', 'SubVertical', 'City',
       'InvestorsName', 'InvestmentType', 'AmountInUSD', 'Remarks'],
      dtype='object')

In [17]:
print("Size of data:", data.shape)

Size of data: (3044, 10)


### Cleaning data

In [21]:
warnings.filterwarnings('ignore')

# Calculate the total missing values in the data
total = data.isnull().sum().sort_values(ascending = False)

# Calculate the percentage of missing values in the data
percent = ((data.isnull().sum()/data.isnull().count())*100).sort_values(ascending = False)

# Store the above two values in a dataset called missing data
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent %'])

missing_data

Unnamed: 0,Total,Percent %
Remarks,0,0.0
AmountInUSD,0,0.0
InvestmentType,0,0.0
InvestorsName,0,0.0
City,0,0.0
SubVertical,0,0.0
IndustryVertical,0,0.0
StartupName,0,0.0
Date,0,0.0
SNo,0,0.0
