In [51]:
import pandas as pd

companies = pd.read_csv("companies.txt", delimiter="\t", encoding='ISO-8859-1')
round2 = pd.read_csv("rounds2.csv", encoding='ISO-8859-1')

# Setting the value of key column to lower case to ensure consistency with joining dataframe
companies['permalink'] = companies['permalink'].str.lower()
round2['company_permalink'] = round2['company_permalink'].str.lower()

# Replacing all NaN (Null) values with 0 to ensure that we get consistent aggregations
round2['raised_amount_usd'].fillna(0, inplace=True)

# Table 1.1 outcome
print("\n\nTable 1.1")
print("----------\n")

# Get details of unique companies in companies file
print("Unique values in companies DataFrame :", companies['permalink'].nunique())

# Get details of unique companies in round2
print("Unique values in companies round2 :", round2['company_permalink'].nunique())

# Find out of there are companies in round2 that are not present in companies file

df1 = pd.DataFrame(companies,columns=['permalink'])
df2 = pd.DataFrame(round2,columns=['company_permalink'])

set_diff_df = pd.concat([df2, df1, df1], sort=True).drop_duplicates(keep=False)
print("Additional companies in round2 that does not exist in companies DataFrame :", set_diff_df['company_permalink'].count())

# Merger two data frames
master_frame = pd.merge(companies, round2, how='outer', left_on=['permalink'], right_on=['company_permalink'])

# Table 2.1 outcome
print("\n\nTable 2.1")
print("----------\n")

# Setting display format for float values that are an outcome of groupby
pd.set_option('display.float_format', lambda x: '%.0f' %x)

# Calculate the most representative value of investment amount by funding type = 'VENTURE'
venture_usd = round2[round2['funding_round_type'] == 'venture'].groupby('funding_round_type')['raised_amount_usd'].mean()
print ("Investment amount by funding type as 'VENTURE'        = ", '%.0f' % venture_usd)

# Calculate the most representative value of investment amount by funding type = 'ANGEL'
angel_usd = round2[round2['funding_round_type'] == 'angel'].groupby('funding_round_type')['raised_amount_usd'].mean()
print ("Investment amount by funding type as 'ANGEL'          = ", '%.0f' % angel_usd)

# Calculate the most representative value of investment amount by funding type = 'SEED'
seed_usd = round2[round2['funding_round_type'] == 'seed'].groupby('funding_round_type')['raised_amount_usd'].mean()
print ("Investment amount by funding type as 'SEED'           = ", '%.0f' % seed_usd)

# Calculate the most representative value of investment amount by funding type = 'PRIVATE EQUITY'
private_equity_usd = round2[round2['funding_round_type'] == 'private_equity'].groupby('funding_round_type')['raised_amount_usd'].mean()
print ("Investment amount by funding type as 'PRIVATE EQUITY' = ", '%.0f' % private_equity_usd)


# Table 3.1 outcome
print("\n\nTable 3.1")
print("----------\n")






Table 1.1
----------

Unique values in companies DataFrame : 66368
Unique values in companies round2 : 66370
Additional companies in round2 that does not exist in companies DataFrame : 42493


Table 2.1
----------

Investment amount by funding type as 'VENTURE'        =  10634054
Investment amount by funding type as 'ANGEL'          =  764564
Investment amount by funding type as 'SEED'           =  556607
Investment amount by funding type as 'PRIVATE EQUITY' =  62111788


In [1]:
%config IPCompleter.greedy = True