In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the employee head counts; the first CSV column becomes the row index.
employees = pd.read_csv(
    'data/toyota_number_of_employees.csv',
    index_col=0,
)
employees.head()

Unnamed: 0_level_0,Unconsolidated number of employees,Consolidated number of employees
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2007,67650,299394
2008,69478,316121
2009,71116,320808
2010,71567,320590
2011,69125,317716


In [3]:
# From the production data we only need the header and the worldwide production numbers
# (file rows 0 and 32; every other row is skipped while reading).
rows_to_keep = [0,32]
production = pd.read_csv('data/toyota_production_by_country.csv', skiprows = lambda x: x not in rows_to_keep, index_col = 0)

# Transpose once so the column labels of the production file become the row index.
worldwide_by_year = production.transpose()

# Relabel a copy rather than `employees` itself, so the frame loaded and displayed
# earlier keeps its original index and this cell can be re-run safely.
# NOTE(review): the relabelling is positional and assumes both files list the same
# periods in the same order -- confirm against the CSVs. A differing row count
# raises a ValueError on the index assignment.
employees_relabelled = employees.copy()
employees_relabelled.index = worldwide_by_year.index

# With identical labels on both sides, assign() aligns the production values row by row.
temp = employees_relabelled.assign(worldwide_production = worldwide_by_year)
temp.head()

Unnamed: 0,Unconsolidated number of employees,Consolidated number of employees,worldwide_production
2007,67650,299394,8534690
2008,69478,316121,8210818
2009,71116,320808,6371291
2010,71567,320590,7623349
2011,69125,317716,6928813


In [4]:
# Read total assets; the first CSV column becomes the row index.
total_assets = pd.read_csv('data/toyota_total_assets.csv', index_col = 0)

# Relabel a copy instead of mutating `temp`, so the frame displayed by the previous
# step stays unchanged and this cell can be re-run safely.
# NOTE(review): the relabelling is positional and assumes the rows of `temp` and of
# the total-assets file are in the same order -- confirm against the CSV. A differing
# row count raises a ValueError on the index assignment.
temp_relabelled = temp.copy()
temp_relabelled.index = total_assets.index

# With identical labels on both sides, assign() aligns the asset values row by row.
temp2 = temp_relabelled.assign(total_assets = total_assets)
temp2.head()

Unnamed: 0_level_0,Unconsolidated number of employees,Consolidated number of employees,worldwide_production,total_assets
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007,67650,299394,8534690,32574.779
2008,69478,316121,8210818,32458.32
2009,71116,320808,6371291,29062.037
2010,71567,320590,7623349,30349.287
2011,69125,317716,6928813,29818.166


In [5]:
# Regression inputs: x = ln(L/K) and y = ln(P/K), where L is the unconsolidated
# number of employees, P the worldwide production and K the total assets.
# Both are stored as new columns on temp2.
labour_per_capital = temp2['Unconsolidated number of employees'].div(temp2['total_assets'])
temp2['x'] = np.log(labour_per_capital)

output_per_capital = temp2['worldwide_production'].div(temp2['total_assets'])
temp2['y'] = np.log(output_per_capital)

temp2.head()

Unnamed: 0_level_0,Unconsolidated number of employees,Consolidated number of employees,worldwide_production,total_assets,x,y
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007,67650,299394,8534690,32574.779,0.730809,5.568356
2008,69478,316121,8210818,32458.32,0.761053,5.533251
2009,71116,320808,6371291,29062.037,0.89488,5.390125
2010,71567,320590,7623349,30349.287,0.857861,5.526198
2011,69125,317716,6928813,29818.166,0.840799,5.448326


In [6]:
from sklearn.linear_model import LinearRegression

# Single-feature design matrix: kept two-dimensional (a one-column frame) for the estimator.
X = temp2.loc[:, ['x']]

# Ordinary least-squares line through the (x, y) points; fit() hands back the estimator.
linreg = LinearRegression()
fit = linreg.fit(X=X, y=temp2['y'])

In [12]:
# Left disabled: `fit` is the scikit-learn LinearRegression returned by linreg.fit(...),
# and that estimator does not provide a summary() method (summary() belongs to
# statsmodels results objects). The fitted slope and intercept are available as
# linreg.coef_ and linreg.intercept_.
#print(fit.summary())

In [8]:
# Evaluate the fitted line at the observed x values and keep the result both as
# `pred` and as a new 'predictions' column of the data frame.
temp2['predictions'] = pred = linreg.predict(X)

temp2.head()

Unnamed: 0_level_0,Unconsolidated number of employees,Consolidated number of employees,worldwide_production,total_assets,x,y,predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007,67650,299394,8534690,32574.779,0.730809,5.568356,5.464343
2008,69478,316121,8210818,32458.32,0.761053,5.533251,5.484831
2009,71116,320808,6371291,29062.037,0.89488,5.390125,5.575489
2010,71567,320590,7623349,30349.287,0.857861,5.526198,5.550411
2011,69125,317716,6928813,29818.166,0.840799,5.448326,5.538853


In [9]:
# Plot data and model
import matplotlib.pyplot as plt

# Explicit figure/axes objects so labels, title and legend attach to this plot only.
fig, ax = plt.subplots()

# Observed points and the fitted regression line evaluated at the same x values.
ax.scatter(temp2['x'], temp2['y'], label='observed')
ax.plot(temp2['x'], temp2['predictions'], color='red', label='linear fit')

# Label the axes with the quantities the columns were computed from, so the
# figure can be read without the surrounding cells.
ax.set_xlabel('x = ln(unconsolidated employees / total assets)')
ax.set_ylabel('y = ln(worldwide production / total assets)')
ax.set_title('Linear fit of y on x')
ax.legend()

plt.show()

<Figure size 640x480 with 1 Axes>

In [10]:
# Slope (first and only entry of coef_, cast to float) and intercept of the fitted line.
coefficient, intercept = linreg.coef_.astype(float)[0], linreg.intercept_

# Report both parameters with fixed-point formatting.
print(
    "Coefficient is {:f} and intercept is {:f}".format(coefficient, intercept)
)

Coefficient is 0.677427 and intercept is 4.969273
