# Kieran Molloy UCDPA Project CIDAB 2022-01-18
***

## Environment Information
***
<table align="left">
<tr>
    <th>Environment Type</th>
    <th>Anaconda Version</th>
    <th>Anaconda Build Channel</th>
    <th>Python Version</th>
</tr>
<tr>
    <td>Anaconda </td>
    <td>2021.11 </td>
    <td>py39_0 </td>
    <td>3.9.7 </td>
</tr>
</table>    

## Datasource
***
<table align="left">
<tr>
    <td>https://www.kaggle.com/datasets/deepcontractor/unicorn-companies-dataset?select=Unicorn_Companies.csv</td>
</tr>
</table>

## Import Required Modules
***

In [None]:
#!pip install geonamescache
#!pip install geotext

#!pip install geopy

In [None]:
from matplotlib import pyplot as plt
from IPython.display import display

import pandas as pd
import numpy as np
import seaborn as sns
import re

import geonamescache #https://pypi.org/project/geonamescache/
from geotext import GeoText

## Variables
***

In [None]:
csv = 'Unicorn_Companies.csv'

## Functions
***

In [None]:
def fn_set_float(value):
    """remove non numeric string and convert numeric to float """
    if value == 'None' or value == 'NaN':
      pass
    elif re.search('[Mm]+',value):
        stripped = float(re.search(r'\d+',value).group())
        return stripped/1000
    elif re.search('[Kk]+',value):
        stripped = float(re.search(r'\d+',value).group())
        return stripped/100000
    else:
        stripped = float(re.search(r'\d+',value).group())
        return stripped

In [None]:
def fn_set_int(value):
    """convert float to int"""
    if value == 'None' or value == 'NaN':
      pass
    else:
        toint = int(value)
        return toint

In [None]:
def converttobillions(value):
      
    pass

In [None]:
def function4():
    
    pass

## Initial CSV import 
***

In [None]:
# read the CSV from the local Jupyter Notebook directory 
df_unicorn_companies_initial = pd.read_csv(csv)

## Inspect Data
***

In [None]:
display(df_unicorn_companies_initial.head(10))

In [None]:
# all columns are imported as object datatypes
print(df_unicorn_companies_initial.info())

In [None]:
# determine if there are any isna missing values , none found
print(df_unicorn_companies_initial.isna().any())

In [None]:
# determine if there are any isnull missing values , none found
print(df_unicorn_companies_initial.isnull().sum())

## Import CSV Data
***

In [None]:
# read_csv with some data manupilation on import , remove unwanted characters, change datatypes , change 'None' to na_values
df_unicorn_companies = pd.read_csv(csv, parse_dates=['Date Joined'], na_values='None',converters={'Valuation ($B)':fn_set_float,'Total Raised': fn_set_float}, dtype={'Founded Year': 'Int64','Investors Count': 'Int64','Deal Terms': 'Int64','Portfolio Exits': 'Int64'})

In [None]:
# re-check datatypes , all seem preferred
print(df_unicorn_companies.info())

In [None]:
# re-check for NaN / None
print(df_unicorn_companies.isna().any())

In [None]:
# re-check count of NaN / None
df_unicorn_companies.isnull().sum()

In [None]:
# using display instead of print for better formatting in Notebook only .. understood print is preferable in commandline
display(df_unicorn_companies)

## Clean Data
***

In [None]:
# get the index and row label of the Dataframe
print(df_unicorn_companies.index)

In [None]:
# using display instead for print for better formatting in Notebook only .. understood print is preferable in commandline
# Company column showing as Index 
display(df_unicorn_companies)

In [None]:
# rename 'Select Inverstors' column name typo and 'Total Raised' added $B to column name in Dataframe 
df_unicorn_companies.rename(columns = {'Select Inverstors':'Select Investors','Total Raised':'Total Raised($B)'}, inplace = True)
# show both renames successful
display(df_unicorn_companies)

In [None]:
# determine if there are any full row duplicates
duplicateRows = df_unicorn_companies[df_unicorn_companies.duplicated()]
# None found
display(duplicateRows)

In [None]:
# determine if there are any unicorn company name duplicates
duplicateRows = df_unicorn_companies[df_unicorn_companies.duplicated(['Company'])]
display(duplicateRows['Company'])

In [None]:
# determine if there are Company Name duplicates 
duplicateCompanies = df_unicorn_companies['Company'].isin(['Bolt','Fabric'])
display(df_unicorn_companies[duplicateCompanies])

In [None]:
# rename Bolt to 'Bolt Financial' at index 37
df_unicorn_companies.at[37,'Company'] = 'Bolt Financial'

In [None]:
# rename Fabric to 'Get Fabric' at index 976
df_unicorn_companies.at[976,'Company'] = 'Get Fabric'
# re-run search for company duplicates

In [109]:
df_unicorn_companies

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Founded Year,Total Raised($B),Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,140.0,2017-04-07,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,7.000,IPO,28,8,5
1,SpaceX,100.0,2012-12-01,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2002,6.000,,29,12,
2,Stripe,95.0,2014-01-23,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,2.000,Asset,39,12,1
3,Klarna,45.0,2011-12-12,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,3.000,Acquired,56,13,1
4,Epic Games,42.0,2018-10-26,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures",1991,4.000,Acquired,25,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,Timescale,1.0,2022-02-22,United States,New York,Internet software & services,"New Enterprise Associates, Benchmark, Two Sigm...",2015,0.181,,7,2,
1033,Scalapay,1.0,2022-02-23,Italy,Milan,Fintech,"Fasanara Capital, Tiger Global Management, Bal...",2019,0.700,,10,2,
1034,Omada Health,1.0,2022-02-23,United States,San Francisco,Health,"U.S. Venture Partners, dRx Capital, Andreessen...",2011,0.449,,30,6,
1035,BlueVoyant,1.0,2022-02-23,United States,New York,Cybersecurity,"8VC, Liberty Strategic Capital, Eden Global Pa...",2017,0.525,,6,2,


In [None]:
# check country names for spelling for accurate grouping
countryname = df_unicorn_companies['Country'].sort_values().unique()
display(countryname)

In [None]:
df_unicorn_companies[(df_unicorn_companies['Select Investors'].isnull()) & (~df_unicorn_companies['Country'].isin(['China','Bahamas']))]

In [None]:
# check city names for accurate grouping
# if we look at the values for city where 'Select Investors' are NaN , it looks like the data is left shifted i.e city value is Industry value etc.
# it could be where City and Country are the same this issue arises i.e Hong Kong and Singapore : China and Bahamas are excepetions
# if country value --> copied to City --> city value copied to Industry Value --> Industry Value copied to Select Investors
nan_investors = df_unicorn_companies[(df_unicorn_companies['Select Investors'].isnull()) & (~df_unicorn_companies['Country'].isin(['China','Bahamas']))]
# 'Country' --> 'City' , 'City' --> 'Industry' ,  Industry' --> 'Select Investors'
display(nan_investors)

In [None]:
# country - Bahamas , city - Nassau
# country - Singapore , city - Singapore
# country - Hong Kong , city - Hong Kong
# change by country 

# do Singapore and Hong Kong first
indexto_shift = shifted_columns[shifted_columns['Country'].isin(['Singapore','Hong Kong'])].index
indexto_shift

In [None]:
# Copy 'Industry' to 'Select Investors' 
for i in indexto_shift:
    df_unicorn_companies.at[i,'Select Investors'] = df_unicorn_companies.at[i,'Industry']
# Copy 'City' to 'Industry'
for i in indexto_shift:
    df_unicorn_companies.at[i,'Industry'] = df_unicorn_companies.at[i,'City']
# Copy 'Country' to 'City'  
for i in indexto_shift:
    df_unicorn_companies.at[i,'City'] = df_unicorn_companies.at[i,'Country']

In [None]:
# check singapore and Hong Kong are ok 
df_unicorn_companies[(df_unicorn_companies['Country'].isin(['Singapore','Hong Kong']))]
# Country 	City 	Industry 	Select Investors are all correct now. 

In [108]:
df_unicorn_companies[(df_unicorn_companies['Select Investors'].isnull()) & (df_unicorn_companies['Country'].isin(['China','Bahamas']))]

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Founded Year,Total Raised($B),Financial Stage,Investors Count,Deal Terms,Portfolio Exits


In [105]:
# googled investors for index 789 , company: LinkSure Network - 'Northern Light Venture Capital' 
df_unicorn_companies.at[789,'Select Investors'] = 'Northern Light Venture Capital'

In [None]:
# next to update is index 10  , company: FTX 
# Country 	City 	Industry 	

i = 10
# Copy 'Industry' to 'Select Investors' 
df_unicorn_companies.at[i,'Select Investors'] = df_unicorn_companies.at[i,'Industry']

# Copy 'City' to 'Industry'
df_unicorn_companies.at[i,'Industry'] = df_unicorn_companies.at[i,'City']

# set City to Nassau
df_unicorn_companies.at[i,'City'] = 'Nassau'

In [111]:
# check if the above changes have been effective 
df_unicorn_companies[df_unicorn_companies['Country'].isin(['China','Bahamas','Singapore','Hong Kong'])]
# Country 	City 	Industry 	Select Investors are all correct now. 

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Founded Year,Total Raised($B),Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,140.0,2017-04-07,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,7.000,IPO,28,8,5
10,FTX,32.0,2021-07-20,Bahamas,Nassau,Fintech,"Sequoia Capital, Thoma Bravo, Softbank",2018,1.000,Acq,40,3,1
14,Xiaohongshu,20.0,2016-03-31,China,Shanghai,E-commerce & direct-to-consumer,"GGV Capital, ZhenFund, Tencent",2013,0.917,,9,3,
17,Yuanfudao,15.0,2017-05-31,China,Beijing,Edtech,"Tencent Holdings, Warbug Pincus, IDG Capital",2012,4.000,Acquired,18,7,1
18,DJI Innovations,15.0,2015-01-23,China,Shenzhen,Hardware,"Accel Partners, Sequoia Capital",2006,1.000,,7,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,Moka,1.0,2021-11-02,China,Beijing,Internet software & services,"GGV Capital, GSR Ventures, FreesFund",2015,0.142,,8,2,
979,Tezign,1.0,2021-11-02,China,Shanghai,Internet software & services,"Sequoia Capital China, Linear Venture, Hearst ...",2015,0.110,,14,1,
986,Gaussian Robotics,1.0,2021-11-10,China,Shanghai,Hardware,"BlueRun Ventures, Grand Flight Investment, Mei...",2013,0.287,,17,1,
1011,Haomao.AI,1.0,2021-12-22,China,Beijing,Artificial Intelligence,"Qualcomm Ventures, Nine Intelligence Capital, ...",2019,0.203,,7,1,


In [124]:
# check country names for spelling for accurate grouping
industryname = df_unicorn_companies['Industry'].sort_values().unique()
display(industryname)
#  shows Fintech and AI have two entries each

array(['Artificial Intelligence', 'Auto & transportation',
       'Consumer & retail', 'Cybersecurity',
       'Data management & analytics', 'E-commerce & direct-to-consumer',
       'Edtech', 'Fintech', 'Hardware', 'Health',
       'Internet software & services', 'Mobile & telecommunications',
       'Other', 'Supply chain, logistics, & delivery', 'Travel'],
      dtype=object)

In [None]:
# check country names for spelling for accurate grouping
industryname = df_unicorn_companies['Industry'].sort_values().unique()
display(industryname)
#  shows Fintech and AI have spelling 

In [122]:
# df_unicorn_companies[df_unicorn_companies['Industry'].isin(['Artificial Intelligence','Artificial intelligence'])]

df_unicorn_companies['Industry'] = df_unicorn_companies['Industry'].replace(to_replace=['Artificial Intelligence','Artificial intelligence'],  value='Artificial Intelligence')
df_unicorn_companies['Industry'] = df_unicorn_companies['Industry'].replace(to_replace=['Fintech','Finttech'],  value='Fintech')

In [123]:
df_unicorn_companies[df_unicorn_companies['Industry'].isin(['Artificial Intelligence','Artificial intelligence','Fintech','Finttech'])]
# 'Artificial Intelligence','Artificial intelligence','Fintech','Finttech' have been replaced with 'Fintech' and 'Artificial Intelligence'

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Founded Year,Total Raised($B),Financial Stage,Investors Count,Deal Terms,Portfolio Exits
0,Bytedance,140.0,2017-04-07,China,Beijing,Artificial Intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,7.000,IPO,28,8,5
2,Stripe,95.0,2014-01-23,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG",2010,2.000,Asset,39,12,1
3,Klarna,45.0,2011-12-12,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita...",2005,3.000,Acquired,56,13,1
6,Checkout.com,40.0,2019-05-02,United Kingdom,London,Fintech,"Tiger Global Management, Insight Partners, DST...",2012,1.000,,15,4,
9,Revolut,33.0,2018-04-26,United Kingdom,London,Fintech,"index Ventures, DST Global, Ribbit Capital",2015,1.000,,31,6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018,Placer.ai,1.0,2022-01-12,United States,Los Altos,Artificial Intelligence,"Fifth Wall Ventures, JBV Capital, Array Ventures",2016,0.166,,19,1,
1023,Esusu,1.0,2022-01-27,United States,New York,Fintech,"Next Play Ventures, Zeal Capital Partners, Sof...",2015,0.144,,25,1,
1024,Betterfly,1.0,2022-02-01,Chile,Santiago,Artificial Intelligence,"QED Investors, DST Global, Endeavor",2018,0.202,,13,1,
1027,Payhawk,1.0,2022-02-14,United Kingdom,London,Fintech,"Earlybird Venture Capital, Eleven Ventures, QE...",2018,0.236,,15,2,


In [129]:
df_unicorn_companies[df_unicorn_companies['Founded Year'].isnull()]

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors,Founded Year,Total Raised($B),Financial Stage,Investors Count,Deal Terms,Portfolio Exits
39,Weilong,10.0,2021-05-08,China,Luohe,Consumer & retail,"Tencent Holdings, Hillhouse Capital Management...",,0.559,,7.0,1.0,
74,Hopin,7.0,2020-11-10,United Kingdom,London,Internet software & services,"Accel, Northzone Ventures, Institutional Ventu...",,0.671,,85.0,3.0,
81,Argo AI,7.0,2019-07-12,United States,Pittsburgh,Artificial Intelligence,"Volkswagen Group, Ford Autonomous Vehicles",,0.5,,2.0,1.0,
94,Ola Cabs,7.0,2014-10-27,India,Bengaluru,Auto & transportation,"Accel Partners, SoftBank Group, Sequoia Capital",,,,8.0,,
124,C6 Bank,5.0,2020-12-02,Brazil,Sao Paulo,Fintech,Credit Suisse,,0.755,,16.0,6.0,
146,Chipone,4.0,2021-12-20,China,Beijing,Hardware,"China Grand Prosperity Investment, Silk Road H...",,1.0,,30.0,1.0,
170,SSENSE,4.0,2021-06-08,Canada,Montreal,E-commerce & direct-to-consumer,Sequoia Capital,,0.415,,12.0,2.0,
175,CRED,4.0,2021-04-06,India,Bengaluru,Fintech,"Tiger Global Management, DST Global, Sequoia C...",,0.613,,18.0,5.0,
184,Zapier,4.0,2021-01-14,United States,Sunnyvale,Internet software & services,"Sequoia Capital, Bessemer Venture Partners, Th...",,0.002,,6.0,1.0,
185,Clubhouse,4.0,2021-01-24,United States,San Francisco,Mobile & telecommunications,"Andreessen Horowitz, TQ Ventures",,0.002,,6.0,1.0,


In [103]:
for index, row in nan_investors.iterrows():
    print(index, ': ', row['Country'], 'value should be copied to', row['City'], 'value should be copied to', row['Industry'],'value should be copied to', row['Select Investors'])

217 :  Singapore value should be copied to Mobile & telecommunications value should be copied to Kuang-Chi value should be copied to nan
292 :  Hong Kong value should be copied to Fintech value should be copied to Tiger Global Management, Tiger Brokers, DCM Ventures value should be copied to nan
318 :  Singapore value should be copied to E-commerce & direct-to-consumer value should be copied to Jungle Ventures, Accel, Venture Highway value should be copied to nan
438 :  Singapore value should be copied to Artificial intelligence value should be copied to Vision Plus Capital, GSR Ventures, ZhenFund value should be copied to nan
639 :  Singapore value should be copied to Artificial intelligence value should be copied to Hopu Investment Management, Boyu Capital, DC Thomson Ventures value should be copied to nan
757 :  Singapore value should be copied to E-commerce & direct-to-consumer value should be copied to 500 Global, Rakuten Ventures, Golden Gate Ventures value should be copied to na

In [None]:
# check city names for accurate grouping
countryname = df_unicorn_companies['City'].sort_values().unique()

for name in countryname: 
    print(name)

In [None]:
# check industry cvalues for grouping
countryname = df_unicorn_companies['Industry'].sort_values().unique()
display(countryname)

In [None]:
# check industry cvalues for grouping
df_unicorn_companies[df_unicorn_companies['Founded Year'].isna()]

## Analyse
***

In [None]:
# set an index column
df_unicorn_companies.set_index('Company',inplace=True)

In [None]:
# sort Valuation descending 
df_unicorn_companies.sort_values('Valuation ($B)', ascending=False, inplace=True)

In [None]:
# sort sum of Valuation descending buy country
df_unicorn_companies.groupby('Country')['Valuation ($B)'].sum().sort_values(ascending=False)

In [None]:
# sort sum of Valuation descending buy country
df_unicorn_companies.groupby('Industry')['Valuation ($B)'].sum().sort_values(ascending=False)

## Visualise
***