In [133]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import re
import pandas as pd
import numpy as np
import datetime as dt


#from collections import Counter

## Load Databases

In [56]:
# Directory that holds the raw CSV exports. It is defined once so the notebook
# can be pointed at a different location by editing a single line.
DATA_DIR = "/Users/jbpatty/project-3"

# Load each raw table into its own DataFrame.
companies_df = pd.read_csv(f"{DATA_DIR}/companies.csv")

acquisitions_df = pd.read_csv(f"{DATA_DIR}/acquisitions.csv")

investments_df = pd.read_csv(f"{DATA_DIR}/investments.csv")

rounds_df = pd.read_csv(f"{DATA_DIR}/rounds.csv")

additions_df = pd.read_csv(f"{DATA_DIR}/additions.csv")

## Clean Databases

### Company Database

In [68]:
# Show the dtype pandas inferred for each column of the raw companies table.
companies_df.dtypes

permalink            object
name                 object
homepage_url         object
category_list        object
funding_total_usd    object
status               object
country_code         object
state_code           object
region               object
city                 object
funding_rounds        int64
founded_at           object
first_funding_at     object
last_funding_at      object
dtype: object

In [69]:
# (rows, columns) of the raw companies table, before any cleaning.
companies_df.shape

(66368, 14)

#### Remove companies that do not have a name

In [104]:
# Keep only the rows whose company name is present; the raw frame is left untouched.
has_name = companies_df["name"].notna()
clean_companies_df = companies_df[has_name]

# Display the shape after this filter.
clean_companies_df.shape

(66367, 14)

#### Remove companies that do not have a first funding date

In [105]:
# Keep only the rows that have a first funding date.
has_first_funding = clean_companies_df["first_funding_at"].notna()
clean_companies_df = clean_companies_df[has_first_funding]

# Display the shape after this filter.
clean_companies_df.shape

(66343, 14)

#### Remove rows whose first or last funding date has an invalid year (e.g. year 1000 is just wrong). A regex, written as a raw string, keeps only dates that start with 19__ or 20__

In [106]:
# Keep a row only when BOTH funding dates start with "19" or "20".
# str.match anchors at the start of the string, so the pattern tests the
# leading characters of the date text.
year_prefix = r'(19)|(20)'

# na=False: a missing date counts as "no match" and the row is dropped.
# Without it, a NaN in the mask makes boolean indexing raise ValueError.
first_ok = clean_companies_df["first_funding_at"].str.match(year_prefix, na=False)
last_ok = clean_companies_df["last_funding_at"].str.match(year_prefix, na=False)

# .copy() gives an independent frame, so the column assignments in later cells
# modify this frame directly instead of a slice of a parent frame
# (which triggers SettingWithCopyWarning).
clean_companies_df = clean_companies_df[first_ok & last_ok].copy()

clean_companies_df.shape

(66338, 14)

#### Adding datetime functionality 

In [120]:
# Parse both funding-date columns into datetime values, one column at a time.
for date_col in ("first_funding_at", "last_funding_at"):
    clean_companies_df[date_col] = pd.to_datetime(clean_companies_df[date_col])

# Show the resulting column dtypes.
clean_companies_df.dtypes

permalink                    object
name                         object
category_list                object
funding_total_usd           float64
status                       object
country_code                 object
state_code                   object
region                       object
city                         object
funding_rounds                int64
founded_at                   object
first_funding_at     datetime64[ns]
last_funding_at      datetime64[ns]
dtype: object

#### Convert funding total from object to numeric values, and make sure funding rounds is numeric (it is already loaded as int64)

In [108]:
# Build the numeric funding column as one non-inplace chain and assign it back
# with bracket indexing. Calling replace(..., inplace=True) on the Series
# returned by attribute access only modifies a temporary under pandas
# Copy-on-Write; the '-' placeholders would then survive and break to_numeric.
funding_total = (
    clean_companies_df["funding_total_usd"]
    .replace("-", np.nan)  # '-' placeholders become NaN
    .pipe(pd.to_numeric)   # object -> numeric float values
    .fillna(0)             # NaN -> 0
)
# NOTE(review): after fillna(0) a missing amount cannot be told apart from a
# real total of 0 -- confirm that is intended for the downstream analysis.
clean_companies_df["funding_total_usd"] = funding_total

In [109]:
# Preview the first rows of the cleaned table.
clean_companies_df.head()

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,/organization/-fame,#fame,http://livfame.com,Media,10000000.0,operating,IND,16,Mumbai,Mumbai,1,,2015-01-05,2015-01-05
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000.0,operating,USA,DE,DE - Other,Delaware City,2,2014-09-04,2014-03-01,2014-10-14
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3406878.0,operating,,,,,1,,2014-01-30,2014-01-30
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,2000000.0,operating,CHN,22,Beijing,Beijing,1,2007-01-01,2008-03-19,2008-03-19
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,0.0,operating,USA,IL,"Springfield, Illinois",Champaign,1,2010-01-01,2014-07-24,2014-07-24


In [113]:
# Make sure funding_rounds is numeric (the raw dtype listing already reports int64).
rounds_numeric = pd.to_numeric(clean_companies_df["funding_rounds"])
clean_companies_df["funding_rounds"] = rounds_numeric

In [115]:
# Summarize non-null counts and dtypes per column.
clean_companies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66338 entries, 0 to 66367
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   permalink          66338 non-null  object        
 1   name               66338 non-null  object        
 2   homepage_url       61280 non-null  object        
 3   category_list      63191 non-null  object        
 4   funding_total_usd  66338 non-null  float64       
 5   status             66338 non-null  object        
 6   country_code       59383 non-null  object        
 7   state_code         57794 non-null  object        
 8   region             58311 non-null  object        
 9   city               58313 non-null  object        
 10  funding_rounds     66338 non-null  int64         
 11  founded_at         51122 non-null  object        
 12  first_funding_at   66338 non-null  datetime64[ns]
 13  last_funding_at    66338 non-null  datetime64[ns]
dtypes: dat

In [116]:
# Count how many companies fall under each value of the status column.
clean_companies_df.status.value_counts()

operating    53008
closed        6235
acquired      5548
ipo           1547
Name: status, dtype: int64

#### Drop unnecessary columns

In [117]:
# Drop the homepage_url column. Reassigning (instead of inplace=True) together
# with errors="ignore" makes this cell safe to re-run: a second run is a no-op
# rather than a KeyError. `columns=` already names the axis, so axis=1 is omitted.
clean_companies_df = clean_companies_df.drop(columns=["homepage_url"], errors="ignore")

In [119]:
# Preview the last rows of the cleaned table.
clean_companies_df.tail()

Unnamed: 0,permalink,name,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
66363,/organization/zznode-science-and-technology-co...,ZZNode Science and Technology,Enterprise Software,1587301.0,operating,CHN,22,Beijing,Beijing,1,,2012-04-01,2012-04-01
66364,/organization/zzzzapp-com,Zzzzapp Wireless ltd.,Advertising|Mobile|Web Development|Wireless,114304.0,operating,HRV,15,Split,Split,4,2012-05-13,2011-11-01,2014-03-01
66365,/organization/Áeron,ÁERON,,0.0,operating,,,,,1,2011-01-01,2014-08-01,2014-08-01
66366,/organization/Ôasys-2,Ôasys,Consumer Electronics|Internet of Things|Teleco...,18192.0,operating,USA,CA,SF Bay Area,San Francisco,1,2014-01-01,2015-01-01,2015-01-01
66367,/organization/İnovatiff-reklam-ve-tanıtım-hizm...,İnovatiff Reklam ve Tanıtım Hizmetleri Tic,Consumer Goods|E-Commerce|Internet,14851.0,operating,,,,,1,,2013-10-01,2013-10-01


In [123]:
# Serialize the cleaned companies table to disk in pandas' pickle format.
clean_companies_pickle_path = "/Users/jbpatty/project-3/clean_companies_df"
clean_companies_df.to_pickle(clean_companies_pickle_path)