# Kieran Molloy UCDPA Project CIDAB 2022-01-18
***

## Environment Information
***
<table align="left">
<tr>
    <th>Environment Type</th>
    <th>Anaconda Version</th>
    <th>Anaconda Build Channel</th>
    <th>Python Version</th>
</tr>
<tr>
    <td>Anaconda </td>
    <td>2021.11 </td>
    <td>py39_0 </td>
    <td>3.9.7 </td>
</tr>
</table>    

## Datasource
***
<table align="left">
<tr>
    <td>https://www.kaggle.com/datasets/deepcontractor/unicorn-companies-dataset?select=Unicorn_Companies.csv</td>
</tr>
</table>

## Import Required Modules
***

In [None]:
# One-off installation of the third-party packages; the commands are left
# commented out so that a normal run does not reinstall them.
#!pip install geonamescache
#!pip install geotext

#!pip install geopy

In [None]:
from matplotlib import pyplot as plt
from IPython.display import display

import pandas as pd
import numpy as np
import seaborn as sns
import re

import geonamescache #https://pypi.org/project/geonamescache/
from geotext import GeoText

## Variables
***

In [None]:
# File name of the dataset CSV, passed as-is to pd.read_csv (relative path).
csv = "Unicorn_Companies.csv"

## Functions
***

In [None]:
def fn_set_float(value):
    """Convert a money string to a float expressed in billions.

    The numeric part (including any decimals) is extracted and scaled by the
    unit suffix that follows it:

    * no suffix or ``B`` -> value is already in billions, returned unchanged
    * ``M``              -> millions, divided by 1,000
    * ``K``              -> thousands, divided by 1,000,000

    Parameters
    ----------
    value : str or number or None
        Raw cell value, e.g. ``'$7.44B'``, ``'$572M'``, ``'$140'``, ``'None'``.

    Returns
    -------
    float or None
        The amount in billions, or ``None`` when the value is a missing-data
        placeholder (``'None'``, ``'NaN'``, empty) or contains no number.
    """
    if value is None:
        return None
    if not isinstance(value, str):
        # Already numeric (e.g. a float NaN); no parsing needed.
        return float(value)

    text = value.strip()
    if text in ('', 'None', 'NaN'):
        return None

    # Capture the full decimal number (not just the leading digits) and the
    # optional unit letter directly after it. Thousands separators are dropped.
    match = re.search(r'(\d+(?:\.\d+)?|\.\d+)\s*([BbMmKk])?', text.replace(',', ''))
    if match is None:
        return None

    amount = float(match.group(1))
    unit = (match.group(2) or 'B').upper()
    divisor_to_billions = {'B': 1, 'M': 1_000, 'K': 1_000_000}
    return amount / divisor_to_billions[unit]

In [None]:
def fn_set_int(value):
    """Return ``int(value)``; the placeholders 'None' and 'NaN' yield ``None``."""
    if value in ('None', 'NaN'):
        return None
    return int(value)

In [None]:
def converttobillions(value):
    """Placeholder: not implemented yet; ignores ``value`` and returns ``None``."""
    return None

In [None]:
def function4():
    """Placeholder: not implemented yet; always returns ``None``."""
    return None

## Initial CSV import 
***

In [None]:
# Read the CSV (relative path held in `csv`) with pandas' default parsing, so
# the untouched data can be inspected before any cleaning is applied.
df_unicorn_companies_initial = pd.read_csv(csv)

## Inspect Data
***

In [None]:
# Preview the first 10 rows of the raw import.
display(df_unicorn_companies_initial.head(10))

In [None]:
# Summarise column dtypes and non-null counts of the raw import.
# Observation from the original run: all columns are imported as object dtype.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
df_unicorn_companies_initial.info()

In [None]:
# Report, per column, whether any value is missing (NaN) in the raw import.
# Original run: none found.
# NOTE(review): the cleaned import later passes na_values='None', which suggests
# missing values are stored as the literal string 'None' and therefore are not
# detected by this check - confirm against the CSV.
print(df_unicorn_companies_initial.isna().any())

In [None]:
# Count the missing values per column in the raw import (isnull is an alias of
# isna). Original run: none found.
print(df_unicorn_companies_initial.isnull().sum())

## Import CSV Data
***

In [None]:
# Re-read the CSV with data manipulation on import:
#   - 'Date Joined' is parsed as a date
#   - the string 'None' is treated as a missing value
#   - the two money columns are passed through fn_set_float
#   - the count/year columns use the nullable 'Int64' dtype
df_unicorn_companies = pd.read_csv(
    csv,
    parse_dates=['Date Joined'],
    na_values='None',
    converters={
        'Valuation ($B)': fn_set_float,
        'Total Raised': fn_set_float,
    },
    dtype={
        'Founded Year': 'Int64',
        'Investors Count': 'Int64',
        'Deal Terms': 'Int64',
        'Portfolio Exits': 'Int64',
    },
)

In [None]:
# Re-check the column dtypes after the cleaned import.
# Observation from the original run: all dtypes are as preferred.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
df_unicorn_companies.info()

In [None]:
# Re-check which columns contain missing values after the cleaned import
print(df_unicorn_companies.isna().any())

In [None]:
# Re-check the number of missing values per column after the cleaned import
df_unicorn_companies.isnull().sum()

In [None]:
# display() is used instead of print() for richer formatting inside the
# notebook; print() is preferable when running from the command line.
display(df_unicorn_companies)

In [None]:
# Summary statistics of the frame. describe must be *called*: the bare
# attribute only renders the bound-method repr instead of the statistics.
df_unicorn_companies.describe()

## Clean Data
***

In [None]:
# Show the DataFrame's index (its row labels)
print(df_unicorn_companies.index)

In [None]:
# display() is used instead of print() for richer formatting inside the
# notebook; print() is preferable when running from the command line.
# NOTE(review): the original comment said the Company column shows as the
# index, but the import passes no index_col and set_index('Company') is only
# called in the Analyse section, so on a fresh run the index here should still
# be the default one - confirm.
display(df_unicorn_companies)

In [None]:
# Fix the 'Select Inverstors' header typo and add the unit ($B) to the
# 'Total Raised' header.
df_unicorn_companies.rename(
    columns={
        'Select Inverstors': 'Select Investors',
        'Total Raised': 'Total Raised($B)',
    },
    inplace=True,
)
# Display the frame to confirm both renames took effect.
display(df_unicorn_companies)

In [None]:
# Select the rows that exactly duplicate an earlier row (all columns equal)
duplicateRows = df_unicorn_companies[df_unicorn_companies.duplicated()]
# Original run: none found
display(duplicateRows)

In [None]:
# Select the rows whose 'Company' value repeats an earlier row, then list
# those company names
duplicateRows = df_unicorn_companies[df_unicorn_companies.duplicated(['Company'])]
display(duplicateRows['Company'])

In [None]:
# Show every row whose company name is 'Bolt' or 'Fabric' (presumably the names
# reported by the duplicate check above - verify against its output)
duplicateCompanies = df_unicorn_companies['Company'].isin(['Bolt','Fabric'])
display(df_unicorn_companies[duplicateCompanies])

In [None]:
# Rename the company at row label 37 to 'Bolt Financial' to disambiguate it.
# NOTE(review): the label 37 is hard-coded - confirm it still identifies the
# intended 'Bolt' row if the CSV changes.
df_unicorn_companies.at[37,'Company'] = 'Bolt Financial'

In [None]:
# Rename the company at row label 976 to 'Get Fabric' to disambiguate it.
# NOTE(review): the label 976 is hard-coded - confirm it still identifies the
# intended 'Fabric' row if the CSV changes.
df_unicorn_companies.at[976,'Company'] = 'Get Fabric'
# TODO: re-run the search for company duplicates to confirm none remain

In [None]:
# Display the frame after the two company renames
df_unicorn_companies

In [None]:
# List the distinct country names in sorted order, to spot spelling variants
# that would split a group
countryname = df_unicorn_companies['Country'].sort_values().unique()
display(countryname)

In [None]:
# Rows with no 'Select Investors' value, excluding the countries China and Bahamas
df_unicorn_companies[(df_unicorn_companies['Select Investors'].isnull()) & (~df_unicorn_companies['Country'].isin(['China','Bahamas']))]

In [None]:
# Check city names for accurate grouping.
# Looking at the City values where 'Select Investors' is NaN, the data appears
# to be left-shifted, i.e. the City column holds the Industry value, and so on.
# This may arise where City and Country are the same (Hong Kong, Singapore);
# China and Bahamas are exceptions and are excluded here.
# Intended repair: Country value --> City, City value --> Industry,
# Industry value --> Select Investors.
nan_investors = df_unicorn_companies[(df_unicorn_companies['Select Investors'].isnull()) & (~df_unicorn_companies['Country'].isin(['China','Bahamas']))]
# 'Country' --> 'City' , 'City' --> 'Industry' , 'Industry' --> 'Select Investors'
display(nan_investors)

In [None]:
# country - Bahamas , city - Nassau
# country - Singapore , city - Singapore
# country - Hong Kong , city - Hong Kong
# change by country

# Do Singapore and Hong Kong first: collect the row labels of the
# missing-investor rows (nan_investors) whose Country is one of the two.
# The original referenced an undefined name `shifted_columns`, which raises a
# NameError on a fresh kernel; nan_investors is the frame built for this step.
indexto_shift = nan_investors[nan_investors['Country'].isin(['Singapore','Hong Kong'])].index
indexto_shift

In [None]:
# Undo the left shift for the selected rows, one column at a time.
# The order matters: each column is copied into its right-hand neighbour
# before it is itself overwritten.
for target, source in (
    ('Select Investors', 'Industry'),
    ('Industry', 'City'),
    ('City', 'Country'),
):
    for i in indexto_shift:
        df_unicorn_companies.at[i, target] = df_unicorn_companies.at[i, source]

In [None]:
# Check the Singapore and Hong Kong rows after the shift.
# Original run: Country, City, Industry and Select Investors are all correct now.
df_unicorn_companies[(df_unicorn_companies['Country'].isin(['Singapore','Hong Kong']))]

In [None]:
# Rows with no 'Select Investors' value where the country is China or Bahamas
df_unicorn_companies[(df_unicorn_companies['Select Investors'].isnull()) & (df_unicorn_companies['Country'].isin(['China','Bahamas']))]

In [None]:
# Fill the missing investor at row label 789 (company: LinkSure Network) with
# a value found by a manual web search: 'Northern Light Venture Capital'.
# NOTE(review): the label 789 is hard-coded - confirm it matches the intended row.
df_unicorn_companies.at[789,'Select Investors'] = 'Northern Light Venture Capital'

In [None]:
# Next row to repair is row label 10 (company: FTX), whose Country / City /
# Industry values are shifted like the rows above. The two copies must run in
# this order so that Industry is saved before it is overwritten.

i = 10
# Copy 'Industry' to 'Select Investors' 
df_unicorn_companies.at[i,'Select Investors'] = df_unicorn_companies.at[i,'Industry']

# Copy 'City' to 'Industry'
df_unicorn_companies.at[i,'Industry'] = df_unicorn_companies.at[i,'City']

# Set City to Nassau explicitly (country is Bahamas, so it is not copied from Country)
df_unicorn_companies.at[i,'City'] = 'Nassau'

In [None]:
# Check whether the changes above have been effective for all four countries.
# Original run: Country, City, Industry and Select Investors are all correct now.
df_unicorn_companies[df_unicorn_companies['Country'].isin(['China','Bahamas','Singapore','Hong Kong'])]

In [None]:
# Check industry names for spelling, for accurate grouping
industryname = df_unicorn_companies['Industry'].sort_values().unique()
display(industryname)
# Original run: Fintech and AI (Artificial Intelligence) each have two entries

In [None]:
# Check industry names for spelling, for accurate grouping.
# NOTE(review): this cell repeats the previous one unchanged.
industryname = df_unicorn_companies['Industry'].sort_values().unique()
display(industryname)
# Original run: Fintech and AI (Artificial Intelligence) have spelling mistakes

In [None]:
# Collapse the spelling variants of two industries onto one canonical label
# each; values already spelled correctly are left untouched.
df_unicorn_companies['Industry'] = df_unicorn_companies['Industry'].replace(
    {
        'Artificial intelligence': 'Artificial Intelligence',
        'Finttech': 'Fintech',
    }
)

In [None]:
# Show all rows in the two affected industries; after the replacement only the
# labels 'Fintech' and 'Artificial Intelligence' should remain.
df_unicorn_companies[df_unicorn_companies['Industry'].isin(['Artificial Intelligence','Artificial intelligence','Fintech','Finttech'])]

In [None]:
# Rows where 'Founded Year' is missing
df_unicorn_companies[df_unicorn_companies['Founded Year'].isnull()]

In [None]:
# For each row captured in nan_investors, print its Country, City, Industry and
# Select Investors values as a chain of "copy this into that" hints.
copy_note = 'value should be copied to'
for index, row in nan_investors.iterrows():
    print(
        index, ': ',
        row['Country'], copy_note,
        row['City'], copy_note,
        row['Industry'], copy_note,
        row['Select Investors'],
    )

In [None]:
# Check city names for accurate grouping: print each distinct city, sorted.
# NOTE: despite its name, `countryname` holds city names in this cell.
countryname = df_unicorn_companies['City'].sort_values().unique()

for name in countryname: 
    print(name)

In [None]:
# Check industry values for grouping.
# NOTE: despite its name, `countryname` holds industry names in this cell.
countryname = df_unicorn_companies['Industry'].sort_values().unique()
display(countryname)

In [None]:
# Rows where 'Founded Year' is missing (same check as the earlier isnull() cell)
df_unicorn_companies[df_unicorn_companies['Founded Year'].isna()]

## Analyse
***

In [None]:
# Use the company name as the row index.
# Guarded so the cell can be re-run: once 'Company' has become the index it is
# no longer a column, and calling set_index again would raise a KeyError.
if 'Company' in df_unicorn_companies.columns:
    df_unicorn_companies.set_index('Company', inplace=True)

In [None]:
# Sort the frame in place by 'Valuation ($B)', largest first
df_unicorn_companies.sort_values('Valuation ($B)', ascending=False, inplace=True)

In [None]:
# Total valuation per country, sorted largest first
df_unicorn_companies.groupby('Country')['Valuation ($B)'].sum().sort_values(ascending=False)

In [None]:
# Total valuation per industry, sorted largest first
df_unicorn_companies.groupby('Industry')['Valuation ($B)'].sum().sort_values(ascending=False)

## Visualise
***