In [9]:
!pip install pyforest --quiet

In [10]:
import pyforest

### Gathering the Dataset

In [11]:
# Create the local folder for our dataset. exist_ok=True makes this a no-op when the
# folder already exists and avoids the race in a separate exists()-then-create check.

folder_name = 'datasets'

os.makedirs(folder_name, exist_ok=True)

<IPython.core.display.Javascript object>

In [12]:
# Downloading dataset files from url into our local machine
# NOTE: this download step is disabled (commented out); the CSV is instead read
# directly from the same URL by pd.read_csv in the following cell.

# url_data = 'https://raw.githubusercontent.com/idowujames/unicorn-companies/main/Unicorn_Companies.csv'

# r_data = requests.get(url_data)

# with open ('datasets/Unicorn_Companies.csv', 'wb') as file:
#   file.write(r_data.content)

In [13]:
# df_data = pd.read_csv('Data_Dictionary.csv')
# Read the unicorn companies table straight from the raw CSV hosted on GitHub
csv_url = 'https://raw.githubusercontent.com/idowujames/unicorn-companies/main/Unicorn_Companies.csv'
df_comp = pd.read_csv(csv_url)

<IPython.core.display.Javascript object>

In [14]:
# Preview eight randomly drawn rows; the fixed seed keeps the preview reproducible
df_comp.sample(
    random_state=1,
    n=8,
)

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
635,Qingting FM,$1B,2021-06-02,Internet software & services,Shanghai,China,Asia,2011,$164M,"China Culture Industrial Investment Fund, We C..."
428,Zenoti,$2B,2020-12-15,Internet software & services,Bellevue,United States,North America,2010,$331M,"Norwest Venture Partners, Accel, Tiger Global ..."
49,OutSystems,$10B,2018-06-05,Internet software & services,Boston,United States,North America,2001,$572M,"KKR, ES Ventures, North Bridge Growth Equity"
204,Articulate,$4B,2021-07-01,Edtech,New York,United States,North America,2015,$2B,"Blackstone, ICONIQ Growth, General Atlantic"
998,PicsArt,$1B,2021-08-26,Mobile & telecommunications,San Francisco,United States,North America,2011,$195M,"Sequoia Capital, DCM Ventures, Insight Partners"
80,Ramp,$8B,2021-03-29,Fintech,New York,United States,North America,2019,$660M,"D1 Capital Partners, Stripe, Coatue Management"
345,Outschool,$3B,2021-04-14,Edtech,San Francisco,United States,North America,2015,$240M,"Uniion Square Ventures, Tiger Global Managemen..."
531,Moveworks,$2B,2021-06-30,Artificial intelligence,Mountain View,United States,North America,2016,$305M,"Lightspeed Venture Partners, Sapphire Ventures..."


In [15]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df_comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Company           1074 non-null   object
 1   Valuation         1074 non-null   object
 2   Date Joined       1074 non-null   object
 3   Industry          1074 non-null   object
 4   City              1058 non-null   object
 5   Country           1074 non-null   object
 6   Continent         1074 non-null   object
 7   Year Founded      1074 non-null   int64 
 8   Funding           1074 non-null   object
 9   Select Investors  1073 non-null   object
dtypes: int64(1), object(9)
memory usage: 84.0+ KB


## Questions to be answered
- Which unicorn companies have had the biggest return on investment?

- How long does it usually take for a company to become a unicorn? Has it always been this way?

- Which countries have the most unicorns? Are there any cities that appear to be industry hubs?

- Which investors have funded the most unicorns?

### Data Cleaning

- Create a copy of the dataframe for cleaning
- Change valuation from object datatype to float datatype
- Change date joined from object datatype to datetime
- Change funding from object datatype to float datatype
- Rename all columns to be more pythonic
- Deal with missing values

### Creating a copy of the dataframe for cleaning


In [16]:
# Clean an independent (deep) copy so that the loaded df_comp frame is left untouched
df = df_comp.copy(deep=True)

df.head(n=5)

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
0,Bytedance,$180B,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$100B,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100B,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95B,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,$46B,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita..."


#### Changing valuation from object datatype to float datatype

In [17]:
# Pull the numeric part out of labels such as '$180B' and store it as a float.
# - raw string: '\d' in a normal string literal is an invalid escape sequence
# - optional fractional part: a value like '$1.5B' is no longer truncated to 1
# - expand=False: extract returns a Series instead of a one-column DataFrame
df['Valuation'] = (
    df['Valuation']
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype('float')
)

In [18]:
# Put the unit in the column name: the values are expressed in billions of dollars

df = df.rename(columns={'Valuation': 'Valuation(B/$)'})

### Test

In [19]:
# Check the result: the Valuation(B/$) column should now be reported as float64
df.info()
df.head(n=3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1074 non-null   object 
 1   Valuation(B/$)    1074 non-null   float64
 2   Date Joined       1074 non-null   object 
 3   Industry          1074 non-null   object 
 4   City              1058 non-null   object 
 5   Country           1074 non-null   object 
 6   Continent         1074 non-null   object 
 7   Year Founded      1074 non-null   int64  
 8   Funding           1074 non-null   object 
 9   Select Investors  1073 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 84.0+ KB


Unnamed: 0,Company,Valuation(B/$),Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
0,Bytedance,180.0,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,100.0,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,100.0,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."


### Change date joined from object datatype to datetime

In [20]:
# Parse the whole column with a single vectorised call; going through .apply
# would invoke pd.to_datetime separately for every row
df['Date Joined'] = pd.to_datetime(df['Date Joined'])
df.info()

<IPython.core.display.Javascript object>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Company           1074 non-null   object        
 1   Valuation(B/$)    1074 non-null   float64       
 2   Date Joined       1074 non-null   datetime64[ns]
 3   Industry          1074 non-null   object        
 4   City              1058 non-null   object        
 5   Country           1074 non-null   object        
 6   Continent         1074 non-null   object        
 7   Year Founded      1074 non-null   int64         
 8   Funding           1074 non-null   object        
 9   Select Investors  1073 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(7)
memory usage: 84.0+ KB


### Changing funding from object datatype to float datatype

In [21]:
import re
def funds(x):
  """Convert a funding label such as '$8B' or '$164M' to billions of dollars.

  Parameters
  ----------
  x : str or missing value
      Funding label. A label containing 'B' is read as billions, one containing
      'M' as millions. None/NaN are treated as missing.

  Returns
  -------
  float or int
      The amount in billions (million amounts are scaled by 0.001 and rounded to
      3 decimals), or the sentinel -999 when the value is missing, has no
      recognised unit, or contains no number.

  Raises
  ------
  TypeError
      If x is a non-missing, non-string value (e.g. an already converted number),
      so that re-applying the function to converted data fails loudly.
  """
  # Arbitrary low number used to flag missing or unknown values; it must be
  # filtered out before any aggregation on the resulting column.
  unknown = -999

  if not isinstance(x, str):
    # None and NaN (the only value that is not equal to itself) count as missing
    if x is None or x != x:
      return unknown
    raise TypeError(f'funding label must be a string, got {type(x).__name__}')

  # Raw-string pattern; the optional fractional part keeps '$1.5B' from becoming 1
  match = re.search(r'\d+(?:\.\d+)?', x)
  if match is None:
    return unknown

  amount = float(match.group())
  if 'B' in x:
    return amount
  if 'M' in x:
    return round(amount * 0.001, 3)
  return unknown

# Convert every funding label with funds(), then write the result back to the column
funding_converted = df['Funding'].apply(funds)
df['Funding'] = funding_converted

### Renaming the Funding column to reflect that it is in billions

In [23]:
# Put the unit in the column name, matching the Valuation(B/$) naming
df = df.rename(columns={'Funding': 'Funding(B/$)'})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Company           1074 non-null   object        
 1   Valuation(B/$)    1074 non-null   float64       
 2   Date Joined       1074 non-null   datetime64[ns]
 3   Industry          1074 non-null   object        
 4   City              1058 non-null   object        
 5   Country           1074 non-null   object        
 6   Continent         1074 non-null   object        
 7   Year Founded      1074 non-null   int64         
 8   Funding           1074 non-null   float64       
 9   Select Investors  1073 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(6)
memory usage: 84.0+ KB


In [None]:
# Display df_comp, the frame as loaded from the CSV; the cleaning steps above were
# applied to the copy `df`, so the string-typed Valuation/Funding values are expected here
df_comp

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
0,Bytedance,$180B,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$100B,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100B,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95B,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,$46B,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita..."
...,...,...,...,...,...,...,...,...,...,...
1069,Zhaogang,$1B,2017-06-29,E-commerce & direct-to-consumer,Shanghai,China,Asia,2012,$379M,"K2 Ventures, Matrix Partners China, IDG Capital"
1070,Zhuan Zhuan,$1B,2017-04-18,E-commerce & direct-to-consumer,Beijing,China,Asia,2015,$990M,"58.com, Tencent Holdings"
1071,Zihaiguo,$1B,2021-05-06,Consumer & retail,Chongqing,China,Asia,2018,$80M,"Xingwang Investment Management, China Capital ..."
1072,Zopa,$1B,2021-10-19,Fintech,London,United Kingdom,Europe,2005,$792M,"IAG Capital Partners, Augmentum Fintech, North..."
