# Kieran Molloy UCDPA Project CIDAB 2022-01-18
***

## Environment Information
***
<table align="left">
<tr>
    <th>Environment Type</th>
    <th>Anaconda Version</th>
    <th>Anaconda Build Channel</th>
    <th>Python Version</th>
</tr>
<tr>
    <td>Anaconda </td>
    <td>2021.11 </td>
    <td>py39_0 </td>
    <td>3.9.7 </td>
</tr>
</table>    

## Datasource
***
<table align="left">
<tr>
    <td>https://www.kaggle.com/datasets/deepcontractor/unicorn-companies-dataset?select=Unicorn_Companies.csv</td>
</tr>
</table>

## Import Required Modules
***

In [31]:
#!pip install geonamescache
#!pip install geotext

#!pip install geopy

Found existing installation: spacy 3.2.3
Uninstalling spacy-3.2.3:
  Successfully uninstalled spacy-3.2.3


In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import re

import geonamescache #https://pypi.org/project/geonamescache/
from geotext import GeoText

## Variables
***

In [3]:
csv = 'Unicorn_Companies.csv'

## Functions
***

In [4]:
def fn_set_float(value):
    """remove non numeric string and convert numeric to float """
    if value == 'None' or value == 'NaN':
      pass
    elif re.search('[Mm]+',value):
        stripped = float(re.search(r'\d+',value).group())
        return stripped/1000
    elif re.search('[Kk]+',value):
        stripped = float(re.search(r'\d+',value).group())
        return stripped/100000
    else:
        stripped = float(re.search(r'\d+',value).group())
        return stripped

In [5]:
def fn_set_int(value):
    """convert float to int"""
    if value == 'None' or value == 'NaN':
      pass
    else:
        toint = int(value)
        return toint

In [6]:
def converttobillions(value):
      
    pass

In [7]:
def function4():
    
    pass

## Initial CSV import 
***

In [8]:
# read the CSV from the local Jupyter Notebook directory 
df_unicorn_companies_initial = pd.read_csv(csv)

## Inspect Data
***

In [9]:
display(df_unicorn_companies_initial.head(10))

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,$7.44B,IPO,28,8,5.0
1,SpaceX,$100.3,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,$6.874B,,29,12,
2,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,$2.901B,Asset,39,12,1.0
3,Klarna,$45.6,12/12/2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,$3.472B,Acquired,56,13,1.0
4,Epic Games,$42,10/26/2018,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,$4.377B,Acquired,25,5,2.0
5,Canva,$40,1/8/2018,Australia,Surry Hills,Internet software & services,"Sequoia Capital China, Blackbird Ventures, Mat...",2012,$571.26M,,26,8,
6,Checkout.com,$40,5/2/2019,United Kingdom,London,Fintech,"Tiger Global Management, Insight Partners, DST...",2012,$1.83B,,15,4,
7,Instacart,$39,12/30/2014,United States,San Francisco,"Supply chain, logistics, & delivery","Khosla Ventures, Kleiner Perkins Caufield & By...",2012,$2.686B,,29,12,
8,Databricks,$38,2/5/2019,United States,San Francisco,Data management & analytics,"Andreessen Horowitz, New Enterprise Associates...",2013,$3.497B,,29,8,
9,Revolut,$33,4/26/2018,United Kingdom,London,Fintech,"index Ventures, DST Global, Ribbit Capital",2015,$1.716B,,31,6,


In [10]:
# all columns are imported as object datatypes
print(df_unicorn_companies_initial.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1037 entries, 0 to 1036
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Company            1037 non-null   object
 1   Valuation ($B)     1037 non-null   object
 2   Date Joined        1037 non-null   object
 3   Country            1037 non-null   object
 4   City               1037 non-null   object
 5   Industry           1037 non-null   object
 6   Select Inverstors  1037 non-null   object
 7   Founded Year       1037 non-null   object
 8   Total Raised       1037 non-null   object
 9   Financial Stage    1037 non-null   object
 10  Investors Count    1037 non-null   object
 11  Deal Terms         1037 non-null   object
 12  Portfolio Exits    1037 non-null   object
dtypes: object(13)
memory usage: 105.4+ KB
None


In [11]:
# determine if there are any isna missing values , none found
print(df_unicorn_companies_initial.isna().any())

Company              False
Valuation ($B)       False
Date Joined          False
Country              False
City                 False
Industry             False
Select Inverstors    False
Founded Year         False
Total Raised         False
Financial Stage      False
Investors Count      False
Deal Terms           False
Portfolio Exits      False
dtype: bool


In [12]:
# determine if there are any isnull missing values , none found
print(df_unicorn_companies_initial.isnull().sum())

Company              0
Valuation ($B)       0
Date Joined          0
Country              0
City                 0
Industry             0
Select Inverstors    0
Founded Year         0
Total Raised         0
Financial Stage      0
Investors Count      0
Deal Terms           0
Portfolio Exits      0
dtype: int64


## Import CSV Data
***

In [13]:
# read_csv with some data manupilation on import , remove unwanted characters, change datatypes , change 'None' to na_values
df_unicorn_companies = pd.read_csv(csv, parse_dates=['Date Joined'], na_values='None',converters={'Valuation ($B)':fn_set_float,'Total Raised': fn_set_float}, dtype={'Founded Year': 'Int64','Investors Count': 'Int64','Deal Terms': 'Int64','Portfolio Exits': 'Int64'})

In [14]:
# re-check datatypes , all seem preferred
print(df_unicorn_companies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1037 entries, 0 to 1036
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Company            1037 non-null   object        
 1   Valuation ($B)     1037 non-null   float64       
 2   Date Joined        1037 non-null   datetime64[ns]
 3   Country            1037 non-null   object        
 4   City               1037 non-null   object        
 5   Industry           1037 non-null   object        
 6   Select Inverstors  1020 non-null   object        
 7   Founded Year       994 non-null    Int64         
 8   Total Raised       1013 non-null   float64       
 9   Financial Stage    49 non-null     object        
 10  Investors Count    1036 non-null   Int64         
 11  Deal Terms         1008 non-null   Int64         
 12  Portfolio Exits    49 non-null     Int64         
dtypes: Int64(4), datetime64[ns](1), float64(2), object(6)
memory us

In [15]:
x=df_unicorn_companies.head(1000).sort_values('Total Raised')
x[x['Company'] == 'Elemy']

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms,Portfolio Exits
733,Elemy,1.0,2021-10-06,United States,San Francisco,Health,"General Catalyst, Bling Capital, Felicis Ventures",,0.0001,,1,,


In [16]:
# re-check for NaN / None
print(df_unicorn_companies.isna().any())

Company              False
Valuation ($B)       False
Date Joined          False
Country              False
City                 False
Industry             False
Select Inverstors     True
Founded Year          True
Total Raised          True
Financial Stage       True
Investors Count       True
Deal Terms            True
Portfolio Exits       True
dtype: bool


In [17]:
# re-check count of NaN / None
df_unicorn_companies.isnull().sum()

Company                0
Valuation ($B)         0
Date Joined            0
Country                0
City                   0
Industry               0
Select Inverstors     17
Founded Year          43
Total Raised          24
Financial Stage      988
Investors Count        1
Deal Terms            29
Portfolio Exits      988
dtype: int64

In [18]:
# using display instead of print for better formatting in Notebook only .. understood print is preferable in commandline
display(df_unicorn_companies)

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,140.0,2017-04-07,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,7.000,IPO,28,8,5
1,SpaceX,100.0,2012-12-01,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,6.000,,29,12,
2,Stripe,95.0,2014-01-23,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,2.000,Asset,39,12,1
3,Klarna,45.0,2011-12-12,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,3.000,Acquired,56,13,1
4,Epic Games,42.0,2018-10-26,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,4.000,Acquired,25,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,Timescale,1.0,2022-02-22,United States,New York,Internet software & services,"New Enterprise Associates, Benchmark, Two Sigm...",2015,0.181,,7,2,
1033,Scalapay,1.0,2022-02-23,Italy,Milan,Fintech,"Fasanara Capital, Tiger Global Management, Bal...",2019,0.700,,10,2,
1034,Omada Health,1.0,2022-02-23,United States,San Francisco,Health,"U.S. Venture Partners, dRx Capital, Andreessen...",2011,0.449,,30,6,
1035,BlueVoyant,1.0,2022-02-23,United States,New York,Cybersecurity,"8VC, Liberty Strategic Capital, Eden Global Pa...",2017,0.525,,6,2,


## Clean Data
***

In [19]:
# get the index and row label of the Dataframe
print(df_unicorn_companies.index)

RangeIndex(start=0, stop=1037, step=1)


In [20]:
# using display instead for print for better formatting in Notebook only .. understood print is preferable in commandline
# Company column showing as Index 
display(df_unicorn_companies)

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Inverstors,Founded Year,Total Raised,Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,140.0,2017-04-07,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,7.000,IPO,28,8,5
1,SpaceX,100.0,2012-12-01,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,6.000,,29,12,
2,Stripe,95.0,2014-01-23,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,2.000,Asset,39,12,1
3,Klarna,45.0,2011-12-12,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,3.000,Acquired,56,13,1
4,Epic Games,42.0,2018-10-26,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,4.000,Acquired,25,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,Timescale,1.0,2022-02-22,United States,New York,Internet software & services,"New Enterprise Associates, Benchmark, Two Sigm...",2015,0.181,,7,2,
1033,Scalapay,1.0,2022-02-23,Italy,Milan,Fintech,"Fasanara Capital, Tiger Global Management, Bal...",2019,0.700,,10,2,
1034,Omada Health,1.0,2022-02-23,United States,San Francisco,Health,"U.S. Venture Partners, dRx Capital, Andreessen...",2011,0.449,,30,6,
1035,BlueVoyant,1.0,2022-02-23,United States,New York,Cybersecurity,"8VC, Liberty Strategic Capital, Eden Global Pa...",2017,0.525,,6,2,


In [21]:
# rename 'Select Inverstors' column name typo and 'Total Raised' added $B to column name in Dataframe 
df_unicorn_companies.rename(columns = {'Select Inverstors':'Select Investors','Total Raised':'Total Raised($B)'}, inplace = True)
# show both renames successful
display(df_unicorn_companies)

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Founded Year,Total Raised($B),Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,140.0,2017-04-07,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,7.000,IPO,28,8,5
1,SpaceX,100.0,2012-12-01,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,6.000,,29,12,
2,Stripe,95.0,2014-01-23,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,2.000,Asset,39,12,1
3,Klarna,45.0,2011-12-12,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,3.000,Acquired,56,13,1
4,Epic Games,42.0,2018-10-26,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,4.000,Acquired,25,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,Timescale,1.0,2022-02-22,United States,New York,Internet software & services,"New Enterprise Associates, Benchmark, Two Sigm...",2015,0.181,,7,2,
1033,Scalapay,1.0,2022-02-23,Italy,Milan,Fintech,"Fasanara Capital, Tiger Global Management, Bal...",2019,0.700,,10,2,
1034,Omada Health,1.0,2022-02-23,United States,San Francisco,Health,"U.S. Venture Partners, dRx Capital, Andreessen...",2011,0.449,,30,6,
1035,BlueVoyant,1.0,2022-02-23,United States,New York,Cybersecurity,"8VC, Liberty Strategic Capital, Eden Global Pa...",2017,0.525,,6,2,


In [22]:
# determine if there are any full row duplicates
duplicateRows = df_unicorn_companies[df_unicorn_companies.duplicated()]
# None found
display(duplicateRows)

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Founded Year,Total Raised($B),Financial Stage,Investors Count,Deal Terms,Portfolio Exits


In [23]:
# determine if there are any unicorn company name duplicates
duplicateRows = df_unicorn_companies[df_unicorn_companies.duplicated(['Company'])]
display(duplicateRows['Company'])

63       Bolt
976    Fabric
Name: Company, dtype: object

In [24]:
# determine if there are Company Name duplicates 
duplicateCompanies = df_unicorn_companies['Company'].isin(['Bolt','Fabric'])
display(df_unicorn_companies[duplicateCompanies])

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Founded Year,Total Raised($B),Financial Stage,Investors Count,Deal Terms,Portfolio Exits
37,Bolt,11.0,2021-10-08,United States,San Francisco,Fintech,"Activant Capital, Tribe Capital, General Atlantic",2014,0.963,,37,3,
63,Bolt,8.0,2018-05-29,Estonia,Tallinn,Auto & transportation,"Didi Chuxing, Diamler, TMT Investments",2013,1.0,,28,5,
578,Fabric,1.0,2022-02-24,United States,Bellevue,E-commerce & direct-to-consumer,"Redpoint Ventures, Norwest Venture Partners, S...",2017,0.292,,11,2,
976,Fabric,1.0,2021-10-26,United States,New York,"Supply chain, logistics, & delivery","Innovation Endeavors, Aleph, Temasek",1999,,,1,1,


In [25]:
# rename Bolt to 'Bolt Financial' at index 37
df_unicorn_companies.at[37,'Company'] = 'Bolt Financial'

In [26]:
# rename Fabric to 'Get Fabric' at index 976
df_unicorn_companies.at[976,'Company'] = 'Get Fabric'
# re-run search for company duplicates

In [27]:
# check country names for accurate grouping
countryname = df_unicorn_companies['Country'].sort_values().unique()
display(countryname)

array(['Argentina', 'Australia', 'Austria', 'Bahamas', 'Belgium',
       'Bermuda', 'Brazil', 'Canada', 'Chile', 'China', 'Colombia',
       'Croatia', 'Czech Republic', 'Denmark', 'Estonia', 'Finland',
       'France', 'Germany', 'Hong Kong', 'India', 'Indonesia', 'Ireland',
       'Israel', 'Italy', 'Japan', 'Lithuania', 'Luxembourg', 'Malaysia',
       'Mexico', 'Netherlands', 'Nigeria', 'Norway', 'Philippines',
       'Senegal', 'Singapore', 'South Africa', 'South Korea', 'Spain',
       'Sweden', 'Switzerland', 'Thailand', 'Turkey',
       'United Arab Emirates', 'United Kingdom', 'United States',
       'Vietnam'], dtype=object)

In [122]:
# check city names for accurate grouping
# if we look at the values for city where 'Select Investors' are NaN , it looks like the data is left shifted i.e city value is Industry value etc.
# it could be where City and Country are the same this issue arises i.e Hong Kong and Singapore 
# if country value --> copied to City --> city value copied to Industry Value --> Industry Value copied to Select Investors
nan_investors = df_unicorn_companies[(df_unicorn_companies['Select Investors'].isnull()) & (df_unicorn_companies['Country'] != 'China')]
# 'Country' --> 'City' , 'City' --> 'Industry' ,  Industry' --> 'Select Investors'  : Bahamas
shifted_columns = nan_investors[['Country','City','Industry','Select Investors']]
display(shifted_columns)

Unnamed: 0,Country,City,Industry,Select Investors
10,Bahamas,Fintech,"Sequoia Capital, Thoma Bravo, Softbank",
217,Singapore,Mobile & telecommunications,Kuang-Chi,
292,Hong Kong,Fintech,"Tiger Global Management, Tiger Brokers, DCM Ve...",
318,Singapore,E-commerce & direct-to-consumer,"Jungle Ventures, Accel, Venture Highway",
438,Singapore,Artificial intelligence,"Vision Plus Capital, GSR Ventures, ZhenFund",
639,Singapore,Artificial intelligence,"Hopu Investment Management, Boyu Capital, DC T...",
757,Singapore,E-commerce & direct-to-consumer,"500 Global, Rakuten Ventures, Golden Gate Vent...",
814,Hong Kong,Fintech,"Sequoia Capital China, ING, Alibaba Entreprene...",
882,Singapore,Internet software & services,"Sequoia Capital China, Shunwei Capital Partner...",
911,Singapore,Fintech,"Dragonfly Captial, Qiming Venture Partners, DS...",


In [None]:
# might use iterrows to copy from column to column?
for index, row in nan_investors.iterrows():
    print(index, ': ', row['Country'], 'value should be copied to', row['City'], 'value should be copied to', row['Industry'],'value should be copied to', row['Select Investors'])

In [None]:
# check city names for accurate grouping
countryname = df_unicorn_companies['City'].sort_values().unique()

for name in countryname: 
    print(name)

In [None]:
# check industry cvalues for grouping
countryname = df_unicorn_companies['Industry'].sort_values().unique()
display(countryname)

In [None]:
# check industry cvalues for grouping

df_unicorn_companies[df_unicorn_companies['Founded Year'].isna()]




## Analyse
***

In [None]:
# set an index column
df_unicorn_companies.set_index('Company',inplace=True)

In [None]:
# sort Valuation descending 
df_unicorn_companies.sort_values('Valuation ($B)', ascending=False, inplace=True)

In [None]:
# sort sum of Valuation descending buy country
df_unicorn_companies.groupby('Country')['Valuation ($B)'].sum().sort_values(ascending=False)

In [None]:
# sort sum of Valuation descending buy country
df_unicorn_companies.groupby('Industry')['Valuation ($B)'].sum().sort_values(ascending=False)

## Visualise
***