**Scikit Learn**

In [57]:
import sklearn    # machine learning
import numpy as np  # computation
import pandas as pd # data management

In [74]:
# 1. Load the tech-fundings data set from its raw GitHub URL
DATA_URL = "https://raw.githubusercontent.com/kit-sereyvath/Machine-Learning--Data-Wrangling/main/tech_fundings.csv"
dataset = pd.read_csv(DATA_URL)
# Preview the first ten rows
dataset.head(n=10)

Unnamed: 0,Company,Website,Region,Vertical,Funding Amount (USD),Funding Stage,Funding Date
0,Internxt,https://internxt.com/,Spain,Blockchain,278940.0,Seed,20-Jan
1,Dockflow,https://dockflow.com,Belgium,Logistics,292244.0,Seed,20-Jan
2,api.video,https://api.video,France,Developer APIs,300000.0,Seed,20-Jan
3,Buck.ai,https://buck.ai/,United States,Artificial Intelligence,300000.0,Seed,20-Jan
4,Prodsight,https://www.prodsight.ai,United Kingdom,Artificial Intelligence,529013.0,Seed,20-Jan
5,Harvestr,http://harvestr.io/,France,B2B Software,650000.0,Angel,20-Jan
6,Seatrec,http://seatrec.com/,United States,Marine Technology,970000.0,Seed,20-Jan
7,BeeCanvas,https://beecanvas.com/en,United States,B2B Software,1000000.0,Seed,20-Jan
8,SmartKarrot,https://www.smartkarrot.com/,United States,B2B Software,1000000.0,Angel,20-Jan
9,Prisync,https://prisync.com,Turkey,B2B Software,1100000.0,Seed,20-Jan


############################################

In [59]:
# Dimensions of the data set, returned as a (rows, columns) tuple
dataset.shape

(3575, 7)

In [60]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric column(s)
dataset.describe()

Unnamed: 0,Funding Amount (USD)
count,3566.0
mean,57560140.0
std,298197600.0
min,40000.0
25%,5000000.0
50%,15496300.0
75%,50000000.0
max,16600000000.0


################################################################################

In [61]:
# Identify the numerical variables.
# select_dtypes(include='number') picks columns by their numeric dtype instead of
# assuming "everything that is not object is numeric", which would also capture
# datetime, boolean, category, or dedicated string dtypes.
numerical_variable = dataset.select_dtypes(include='number').columns
# Preview the first five rows of the numerical columns
dataset[numerical_variable].head(5)

Unnamed: 0,Funding Amount (USD)
0,278940.0
1,292244.0
2,300000.0
3,300000.0
4,529013.0


In [62]:
# Identify the categorical variables.
# Every column that is not numeric is treated as categorical. Using
# select_dtypes(exclude='number') rather than comparing dtypes to 'object' keeps this
# working when text columns are stored with a string or category dtype.
categorical_variable = dataset.select_dtypes(exclude='number').columns
# Preview the first five rows of the categorical columns
dataset[categorical_variable].head(5)

Unnamed: 0,Company,Website,Region,Vertical,Funding Stage,Funding Date
0,Internxt,https://internxt.com/,Spain,Blockchain,Seed,20-Jan
1,Dockflow,https://dockflow.com,Belgium,Logistics,Seed,20-Jan
2,api.video,https://api.video,France,Developer APIs,Seed,20-Jan
3,Buck.ai,https://buck.ai/,United States,Artificial Intelligence,Seed,20-Jan
4,Prodsight,https://www.prodsight.ai,United Kingdom,Artificial Intelligence,Seed,20-Jan


################################################################################

In [63]:
# Missing values: element-wise boolean mask, previewed for the first five rows
dataset.isnull().head(5)    # a cell is True where the original value is NaN, i.e. there is no data

# False = value is present
# True  = value is missing

Unnamed: 0,Company,Website,Region,Vertical,Funding Amount (USD),Funding Stage,Funding Date
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False


################################################################################

In [64]:
# Share of missing values in each numerical variable.
# The result is a fraction between 0 and 1 (multiply by 100 for a percentage):
# the mean of the boolean mask equals the number of missing cells divided by the row count.
dataset[numerical_variable].isna().mean()

Funding Amount (USD)    0.002517
dtype: float64

In [65]:
# Share of missing values in each categorical variable.
# The result is a fraction between 0 and 1 (multiply by 100 for a percentage):
# the mean of the boolean mask equals the number of missing cells divided by the row count.
dataset[categorical_variable].isna().mean()

Company          0.000000
Website          0.000000
Region           0.003357
Vertical         0.000000
Funding Stage    0.000000
Funding Date     0.000000
dtype: float64

################################################################################

In [66]:
# Fill the gaps in the categorical columns with the placeholder "Cambodia".
# fillna returns a new frame that is only displayed here; `dataset` is not modified.
categorical_columns = dataset[categorical_variable]
categorical_columns.fillna(value="Cambodia")

Unnamed: 0,Company,Website,Region,Vertical,Funding Stage,Funding Date
0,Internxt,https://internxt.com/,Spain,Blockchain,Seed,20-Jan
1,Dockflow,https://dockflow.com,Belgium,Logistics,Seed,20-Jan
2,api.video,https://api.video,France,Developer APIs,Seed,20-Jan
3,Buck.ai,https://buck.ai/,United States,Artificial Intelligence,Seed,20-Jan
4,Prodsight,https://www.prodsight.ai,United Kingdom,Artificial Intelligence,Seed,20-Jan
...,...,...,...,...,...,...
3570,Headout,https://www.headout.com,United States,Travel,Series B,21-Sep
3571,OLIO,https://olioex.com/,United Kingdom,Wellness,Series B,21-Sep
3572,Trees for Life,https://treesforlife.org.uk,United Kingdom,Wellness,Crowdfunding,21-Sep
3573,Britishvolt,https://britishvolt.com/,United Kingdom,Wellness,Series B,21-Sep


In [67]:
# Fill the gaps in the numerical columns with the placeholder value 111.
# fillna returns a new frame that is only displayed here; `dataset` is not modified.
numerical_columns = dataset[numerical_variable]
numerical_columns.fillna(value=111)

Unnamed: 0,Funding Amount (USD)
0,278940.0
1,292244.0
2,300000.0
3,300000.0
4,529013.0
...,...
3570,12000000.0
3571,43000000.0
3572,2737468.0
3573,70000000.0


In [68]:
# Swap one value for another within a single column:
# every "Cambodia" entry in 'Region' is shown as "Unknown".
# The result is only displayed; `dataset` is not modified.
region = dataset['Region']
region.replace(to_replace="Cambodia", value="Unknown")

0                Spain
1              Belgium
2               France
3        United States
4       United Kingdom
             ...      
3570     United States
3571    United Kingdom
3572    United Kingdom
3573    United Kingdom
3574            France
Name: Region, Length: 3575, dtype: object

################################################################################

In [69]:
# Insert a new row into the dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the new
# record is converted to a one-row frame and added with pd.concat instead.
row_insert = pd.Series(
    ['Tour Arista', 'https://tourarista.com/', 'Cambodia', 'Tourism', 100000, 'Seed', '12-Dec'],
    index=['Company', 'Website', 'Region', 'Vertical', 'Funding Amount (USD)', 'Funding Stage', 'Funding Date'],
)
# to_frame().T turns the Series into a single row; infer_objects() restores per-column
# dtypes so the funding amount stays numeric after concatenation.
row_frame = row_insert.to_frame().T.infer_objects()
# The combined frame is only displayed; `dataset` is not modified.
pd.concat([dataset, row_frame], ignore_index=True)

Unnamed: 0,Company,Website,Region,Vertical,Funding Amount (USD),Funding Stage,Funding Date
0,Internxt,https://internxt.com/,Spain,Blockchain,278940.0,Seed,20-Jan
1,Dockflow,https://dockflow.com,Belgium,Logistics,292244.0,Seed,20-Jan
2,api.video,https://api.video,France,Developer APIs,300000.0,Seed,20-Jan
3,Buck.ai,https://buck.ai/,United States,Artificial Intelligence,300000.0,Seed,20-Jan
4,Prodsight,https://www.prodsight.ai,United Kingdom,Artificial Intelligence,529013.0,Seed,20-Jan
...,...,...,...,...,...,...,...
3571,OLIO,https://olioex.com/,United Kingdom,Wellness,43000000.0,Series B,21-Sep
3572,Trees for Life,https://treesforlife.org.uk,United Kingdom,Wellness,2737468.0,Crowdfunding,21-Sep
3573,Britishvolt,https://britishvolt.com/,United Kingdom,Wellness,70000000.0,Series B,21-Sep
3574,Greenly,https://www.greenly.earth/,France,Wellness,2953675.0,Seed,21-Sep


################################################################################

In [70]:
# Select a single row by its integer position (here: position 111)
dataset.iloc[111]
# A slice selects a range of rows instead, e.g. dataset.iloc[3:7]

Company                                 SpotOn
Website                 https://www.spoton.com
Region                           United States
Vertical                               FinTech
Funding Amount (USD)                50000000.0
Funding Stage                         Series B
Funding Date                            20-Mar
Name: 111, dtype: object

In [71]:
# Select rows by value: index the frame by 'Funding Date' so that .loc can match on it.
# The re-indexed frame gets its own name instead of overwriting `dataset`, because the
# cells further down (e.g. dropping the row labelled 1) rely on the original integer
# index and would fail on a fresh top-to-bottom run otherwise.
# drop=False keeps 'Funding Date' available as a regular column as well as the index.
dataset_by_date = dataset.set_index('Funding Date', drop=False)
dataset_by_date.loc['21-Sep']

Unnamed: 0_level_0,Company,Website,Region,Vertical,Funding Amount (USD),Funding Stage,Funding Date
Funding Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
21-Sep,AON3D,https://aon3d.com,Canada,3D,11500000.0,Series A,21-Sep
21-Sep,FLYR Labs,https://flyrlabs.com,United States,Aerospace,150000000.0,Series C,21-Sep
21-Sep,Avia Solutions Group,https://aviasg.com,Cyprus,Aerospace,351360000.0,Private Equity,21-Sep
21-Sep,Urban Aeronautics,https://www.urbanaero.com/,Israel,Aerospace,10000000.0,Series A,21-Sep
21-Sep,Near Space Labs,https://www.nearspacelabs.com,United States,Aerospace,13000000.0,Series A,21-Sep
...,...,...,...,...,...,...,...
21-Sep,Headout,https://www.headout.com,United States,Travel,12000000.0,Series B,21-Sep
21-Sep,OLIO,https://olioex.com/,United Kingdom,Wellness,43000000.0,Series B,21-Sep
21-Sep,Trees for Life,https://treesforlife.org.uk,United Kingdom,Wellness,2737468.0,Crowdfunding,21-Sep
21-Sep,Britishvolt,https://britishvolt.com/,United Kingdom,Wellness,70000000.0,Series B,21-Sep


################################################################################

In [76]:
# Rename a column: 'Funding Amount (USD)' is shortened to 'FA (USD)'.
# The renamed frame is only displayed; `dataset` keeps its original column names.
column_renames = {'Funding Amount (USD)': 'FA (USD)'}
dataset.rename(columns=column_renames)

Unnamed: 0,Company,Website,Region,Vertical,FA (USD),Funding Stage,Funding Date
0,Internxt,https://internxt.com/,Spain,Blockchain,278940.0,Seed,20-Jan
1,Dockflow,https://dockflow.com,Belgium,Logistics,292244.0,Seed,20-Jan
2,api.video,https://api.video,France,Developer APIs,300000.0,Seed,20-Jan
3,Buck.ai,https://buck.ai/,United States,Artificial Intelligence,300000.0,Seed,20-Jan
4,Prodsight,https://www.prodsight.ai,United Kingdom,Artificial Intelligence,529013.0,Seed,20-Jan
...,...,...,...,...,...,...,...
3570,Headout,https://www.headout.com,United States,Travel,12000000.0,Series B,21-Sep
3571,OLIO,https://olioex.com/,United Kingdom,Wellness,43000000.0,Series B,21-Sep
3572,Trees for Life,https://treesforlife.org.uk,United Kingdom,Wellness,2737468.0,Crowdfunding,21-Sep
3573,Britishvolt,https://britishvolt.com/,United Kingdom,Wellness,70000000.0,Series B,21-Sep


################################################################################

In [78]:
# Remove the 'Website' column (the result is only displayed; `dataset` is not modified)
dataset.drop(columns='Website')
# Equivalent by position: dataset.drop(dataset.columns[1], axis=1), because "Website" sits at index 1 of the list of columns

Unnamed: 0,Company,Region,Vertical,Funding Amount (USD),Funding Stage,Funding Date
0,Internxt,Spain,Blockchain,278940.0,Seed,20-Jan
1,Dockflow,Belgium,Logistics,292244.0,Seed,20-Jan
2,api.video,France,Developer APIs,300000.0,Seed,20-Jan
3,Buck.ai,United States,Artificial Intelligence,300000.0,Seed,20-Jan
4,Prodsight,United Kingdom,Artificial Intelligence,529013.0,Seed,20-Jan
...,...,...,...,...,...,...
3570,Headout,United States,Travel,12000000.0,Series B,21-Sep
3571,OLIO,United Kingdom,Wellness,43000000.0,Series B,21-Sep
3572,Trees for Life,United Kingdom,Wellness,2737468.0,Crowdfunding,21-Sep
3573,Britishvolt,United Kingdom,Wellness,70000000.0,Series B,21-Sep


In [79]:
# Remove the row whose index label is 1, then preview the first five remaining rows
# (the result is only displayed; `dataset` is not modified)
dataset.drop(index=[1]).head(5)

Unnamed: 0,Company,Website,Region,Vertical,Funding Amount (USD),Funding Stage,Funding Date
0,Internxt,https://internxt.com/,Spain,Blockchain,278940.0,Seed,20-Jan
2,api.video,https://api.video,France,Developer APIs,300000.0,Seed,20-Jan
3,Buck.ai,https://buck.ai/,United States,Artificial Intelligence,300000.0,Seed,20-Jan
4,Prodsight,https://www.prodsight.ai,United Kingdom,Artificial Intelligence,529013.0,Seed,20-Jan
5,Harvestr,http://harvestr.io/,France,B2B Software,650000.0,Angel,20-Jan


################################################################################

In [81]:
# List the distinct values that occur in the 'Funding Stage' column.
# NOTE(review): the result contains near-duplicate labels (e.g. 'Unknown' / 'Unkown',
# 'ICO' / 'Initial Coin Offering') that probably need normalising — confirm before modelling.
dataset['Funding Stage'].unique()

array(['Seed', 'Angel', 'Series A', 'Series B', 'Series C', 'Series D',
       'Series E', 'Series F', 'Series G', 'Debt Financing', 'Pre-Seed',
       'Unknown', 'ICO', 'Initial Coin Offering', 'Unkown',
       'Private Equity', 'Undisclosed', 'Grant', 'Series H', 'Funding',
       'Crowdfunding', 'Growth'], dtype=object)