In [1]:
# Import our dependencies
import os 
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import numpy as np
import tensorflow as tf

### Preprocessing Data 

In [2]:
# Read the raw company records from the CSV file next to this notebook.
companies_df = pd.read_csv('Unicorn_companies.csv')
# Derive the working frame without 'Portfolio Exits'; companies_df itself is not modified.
new_companies_df = companies_df.drop(columns=['Portfolio Exits'])
new_companies_df.head()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,$7.44B,IPO,28,8
1,SpaceX,$100.3,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,$6.874B,,29,12
2,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,$2.901B,Asset,39,12
3,Klarna,$45.6,12/12/2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,$3.472B,Acquired,56,13
4,Epic Games,$42,10/26/2018,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,$4.377B,Acquired,25,5


In [3]:
# Convert Valuation column to float
# regex=False removes the literal '$' character. As a regular expression '$'
# is the end-of-string anchor, and pandas versions whose default is regex=True
# only strip it through a single-character special case that emits a FutureWarning.
new_companies_df['Valuation ($B)'] = (
    new_companies_df['Valuation ($B)']
    .str.replace('$', '', regex=False)
    .astype(float)
)


  


In [4]:


#Convert Total Raised column to float
#Credit to https://stackoverflow.com/questions/71543484/kernel-dies-when-processing-dataframe

# Drop the currency sign first. regex=False treats '$' as a literal character
# instead of the regular-expression end-of-string anchor, which avoids the
# FutureWarning raised by pandas versions that default to regex=True.
new_companies_df['Total Raised'] = new_companies_df['Total Raised'].str.replace('$', '', regex=False)

def replace(x):
  """Convert an abbreviated amount such as "7.44B" into a plain float.

  Args:
    x: Amount text with the currency sign already removed. A trailing
      "K", "M" or "B" scales the number by 1e3, 1e6 or 1e9; the literal
      text "None" stands for zero.

  Returns:
    The amount as a float. NaN is returned when x is not a string (for
    example a missing cell) or carries none of the known suffixes.

  Raises:
    ValueError: if the text before a recognised suffix is not a number.
  """
  val_dict = {"M": 1000000, "B": 1000000000, 'K': 1000}

  # Missing cells reach .apply() as float NaN; pass them through as NaN
  # instead of failing on the string operations below.
  if not isinstance(x, str):
    return float('nan')

  text = x.strip()
  if text == 'None':
    return 0.0

  for suffix, multiplier in val_dict.items():
    if text.endswith(suffix):
      # Cut the suffix off. Substituting it with "0" would append a digit,
      # turning "7B" into float("70") and inflating the amount tenfold.
      return float(text[:-len(suffix)]) * float(multiplier)

  return float('nan')

# Parse each 'Total Raised' string element-wise and keep the numeric result
# in a separate 'Total Raised($)' column, then preview the frame.
new_companies_df['Total Raised($)'] = new_companies_df['Total Raised'].apply(func=replace)
new_companies_df.head(n=5)

  after removing the cwd from sys.path.


Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms,Total Raised($)
0,Bytedance,140.0,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,7.44B,IPO,28,8,7440000000.0
1,SpaceX,100.3,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,6.874B,,29,12,6874000000.0
2,Stripe,95.0,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,2.901B,Asset,39,12,2901000000.0
3,Klarna,45.6,12/12/2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,3.472B,Acquired,56,13,3472000000.0
4,Epic Games,42.0,10/26/2018,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,4.377B,Acquired,25,5,4377000000.0


In [5]:
# The text column is superseded by 'Total Raised($)', so remove it.
new_companies_df = new_companies_df.drop(columns=['Total Raised'])
# Keep a handle on the numeric amounts (reused by the binning cell) and show their dtype.
val_counts = new_companies_df['Total Raised($)']
val_counts.dtype

dtype('float64')

In [6]:
#Bin Total Raised($) into labelled ranges
invested_bins = [0,1000,1000000,1000000000,1000000000000]
invested_groups = ['<1000','1001-1000000','1000001-1000000000','1000000001-1000000000000']

# include_lowest=True closes the first interval on the left, so an amount of
# exactly 0 is labelled '<1000' instead of falling outside every bin (NaN).
# The earlier groupby(...).count() call was dropped: its result was never
# displayed or stored. To see the bin sizes, call .value_counts() on the new column.
new_companies_df['Total Raised Ranges($)'] = pd.cut(
    val_counts,
    invested_bins,
    labels=invested_groups,
    include_lowest=True,
)
new_companies_df.head()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Financial Stage,Investors Count,Deal Terms,Total Raised($),Total Raised Ranges($)
0,Bytedance,140.0,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,IPO,28,8,7440000000.0,1000000001-1000000000000
1,SpaceX,100.3,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,,29,12,6874000000.0,1000000001-1000000000000
2,Stripe,95.0,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,Asset,39,12,2901000000.0,1000000001-1000000000000
3,Klarna,45.6,12/12/2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,Acquired,56,13,3472000000.0,1000000001-1000000000000
4,Epic Games,42.0,10/26/2018,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,Acquired,25,5,4377000000.0,1000000001-1000000000000


In [9]:
# Reduce 'Date Joined' to its four-digit year, kept as text via strftime,
# then expose the column under the name 'Year Joined'.
# NOTE(review): infer_datetime_format may be deprecated in newer pandas releases — confirm before upgrading.
new_companies_df['Date Joined'] = (
    pd.to_datetime(new_companies_df['Date Joined'], infer_datetime_format=True)
    .dt.strftime("%Y")
)
new_companies_df = new_companies_df.rename(columns={'Date Joined': 'Year Joined'})
new_companies_df.head()

Unnamed: 0,Company,Valuation ($B),Year Joined,Country,City,Industry,Select Inverstors,Founded Year,Financial Stage,Investors Count,Deal Terms,Total Raised($),Total Raised Ranges($)
0,Bytedance,140.0,2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,IPO,28,8,7440000000.0,1000000001-1000000000000
1,SpaceX,100.3,2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,,29,12,6874000000.0,1000000001-1000000000000
2,Stripe,95.0,2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,Asset,39,12,2901000000.0,1000000001-1000000000000
3,Klarna,45.6,2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,Acquired,56,13,3472000000.0,1000000001-1000000000000
4,Epic Games,42.0,2018,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,Acquired,25,5,4377000000.0,1000000001-1000000000000


In [19]:
# Keep only rows whose 'Founded Year' is not the placeholder text 'None'.
# .copy() makes the filtered frame independent of the original, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
new_companies_df = new_companies_df[new_companies_df['Founded Year'] != 'None'].copy()

# 'Transition Time' = 'Year Joined' minus 'Founded Year', both cast to int.
new_companies_df['Transition Time'] = (
    new_companies_df['Year Joined'].astype(int)
    - new_companies_df['Founded Year'].astype(int)
)
new_companies_df.head()

Unnamed: 0,Company,Valuation ($B),Year Joined,Country,City,Industry,Select Inverstors,Founded Year,Financial Stage,Investors Count,Deal Terms,Total Raised($),Total Raised Ranges($),Transition Time
0,Bytedance,140.0,2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,IPO,28,8,7440000000.0,1000000001-1000000000000,5
1,SpaceX,100.3,2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,,29,12,6874000000.0,1000000001-1000000000000,10
2,Stripe,95.0,2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,Asset,39,12,2901000000.0,1000000001-1000000000000,4
3,Klarna,45.6,2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,Acquired,56,13,3472000000.0,1000000001-1000000000000,6
4,Epic Games,42.0,2018,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,Acquired,25,5,4377000000.0,1000000001-1000000000000,27


In [None]:
# NOTE(review): disabled draft, kept for reference only. The heading mentions
# the financial stage, but the code below rewrites 'Valuation ($B)' on
# companies_df, and val_counts is assigned the 'Total Raised($)' column
# earlier in this notebook rather than a value_counts() result. Reconcile
# these before re-enabling.
#Replace financial stage with 'Other'
# replace_stage = list(val_counts[val_counts <= 1.0].index)

# #Replace in df
# for values in replace_stage:
#     companies_df['Valuation ($B)']=companies_df['Valuation ($B)'].replace(values, "Other")
    
# #Verify binning was successful 
# companies_df["Valuation ($B)"].value_counts()

In [20]:
# Build the column list used by the cells that follow.
# NOTE(review): despite the name, this collects every column of the frame,
# not only the object-dtype ones — confirm whether a dtype filter is intended.
companies_cat = new_companies_df.columns.tolist()

In [21]:
# Count the distinct values in each listed column.
new_companies_df.loc[:, companies_cat].nunique()

Company                   992
Valuation ($B)            192
Year Joined                12
Country                    45
City                      248
Industry                   32
Select Inverstors         966
Founded Year               36
Financial Stage            10
Investors Count            53
Deal Terms                 16
Total Raised($)           885
Total Raised Ranges($)      2
Transition Time            31
dtype: int64

In [None]:
# #Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse = False)

# #Fit and transform the OneHotEncoder using the categorical variable list 
# encode_df = pd.DataFrame(enc.fit_transform(new_companies_df[companies_cat]))

# #Add the encoded variable names to the df
# encode_df.columns = enc.get_feature_names_out(companies_cat)
# encode_df.head()