In [1]:
import numpy as np

In [2]:
small_counts = np.random.randint(0, 100, 20)
small_counts

array([34, 78, 56, 22, 69, 11, 63, 34, 36, 69, 80, 25, 54, 47, 82, 15,  2,
       13,  2, 42])

In [3]:
np.floor_divide(small_counts, 10)

array([3, 7, 5, 2, 6, 1, 6, 3, 3, 6, 8, 2, 5, 4, 8, 1, 0, 1, 0, 4])

In [4]:
large_counts = [296, 8286, 64011, 80, 3, 725, 867, 2215, 7689, 11495, 91897, 44, 28, 7971, 926, 122, 22222]

In [5]:
np.floor(np.log10(large_counts))

array([2., 3., 4., 1., 0., 2., 2., 3., 3., 4., 4., 1., 1., 3., 2., 2., 4.])

In [6]:
import pandas as pd

In [7]:
pd.qcut(large_counts, 4, labels=False)

array([1, 2, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 2, 1, 0, 3])

In [8]:
large_counts_series = pd.Series(large_counts)
large_counts_series.quantile([0.25, 0.5, 0.75])

0.25     122.0
0.50     926.0
0.75    8286.0
dtype: float64

In [9]:
import pandas as pd
import json

In [10]:
biz_f = open('data/yelp_academic_dataset_business.json')
biz_df = pd.DataFrame([json.loads(x) for x in biz_f.readlines()])
biz_f.close()

In [11]:
biz_df['review_count'].max()

1170

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

In [13]:
sns.set_style('whitegrid')
fig, ax = plt.subplots()
biz_df['review_count'].hist(ax=ax, bins=100)
ax.set_yscale('log')
ax.tick_params(labelsize=14)
ax.set_xlabel('Review Count', fontsize=14)
ax.set_ylabel('Occurrence', fontsize=14)

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Occurrence')

In [14]:
deciles = biz_df['review_count'].quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])
deciles

0.1     3.0
0.2     3.0
0.3     4.0
0.4     5.0
0.5     6.0
0.6     8.0
0.7    13.0
0.8    24.0
0.9    56.0
Name: review_count, dtype: float64

In [15]:
sns.set_style('whitegrid')
fig, ax = plt.subplots()
biz_df['review_count'].hist(ax=ax, bins=100)
for pos in deciles:
    handle = plt.axvline(pos, color='r')
ax.legend([handle], ['deciles'], fontsize=14)
ax.set_yscale('log')
ax.set_xscale('log')
ax.tick_params(labelsize=14)
ax.set_xlabel('Review Count', fontsize=14)
ax.set_ylabel('Occurrence', fontsize=14)

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Occurrence')

In [16]:
import numpy as np

In [17]:
y = np.arange(0.00001, 3, 0.01)
x = np.power(10, y)

In [18]:
fig, ax = plt.subplots()
plt.plot(x, y, 'b')
ax.tick_params(labelsize=14)
ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('log10(x)', fontsize=14)

<IPython.core.display.Javascript object>

Text(0, 0.5, 'log10(x)')

In [19]:
log_review_count = np.log10(biz_df['review_count'] + 1)

In [20]:
plt.figure()
ax = plt.subplot(2,1,1)
biz_df['review_count'].hist(ax=ax, bins=100)
ax.tick_params(labelsize=14)
ax.set_xlabel('review_count', fontsize=14)
ax.set_ylabel('Occurrence', fontsize=14)

ax = plt.subplot(2,1,2)
log_review_count.hist(ax=ax, bins=100)
ax.tick_params(labelsize=14)
ax.set_xlabel('log10(review_count))', fontsize=14)
ax.set_ylabel('Occurrence', fontsize=14)

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Occurrence')

In [21]:
biz_df

Unnamed: 0,business_id,full_address,hours,open,categories,city,review_count,name,neighborhoods,longitude,state,stars,latitude,attributes,type
0,O_X3PGhk3Y5JWVi866qlJg,"1501 W Bell Rd\nPhoenix, AZ 85023","{'Monday': {'close': '18:00', 'open': '11:00'}...",True,"[Active Life, Arts & Entertainment, Stadiums &...",Phoenix,29,Turf Paradise Race Course,[],-112.092329,AZ,4.0,33.638573,"{'Take-out': False, 'Wi-Fi': 'free', 'Good For...",business
1,QbrM7wqtmoNncqjc6GtFaQ,"18501 N 83rd Avenue\nGlendale, AZ 85308",{},True,"[Tires, Automotive, Fashion, Shopping, Departm...",Glendale,3,Sam's Club Members Only,[],-112.234755,AZ,3.5,33.648545,"{'Parking': {'garage': False, 'street': False,...",business
2,7lbvsGKzhjuX3oJtaXJvOg,"5000 S Arizona Mills Cir\nSte 590\nTempe, AZ 8...","{'Monday': {'close': '21:00', 'open': '10:00'}...",True,"[Women's Clothing, Men's Clothing, Fashion, Sh...",Tempe,7,Forever 21,[],-111.964485,AZ,3.5,33.383123,"{'Parking': {'garage': False, 'street': False,...",business
3,gjxoKVsRJwEoa8zd9XxlAw,"912 W Sycamore Pl\nChandler, AZ 85225","{'Monday': {'close': '19:00', 'open': '06:00'}...",True,"[Pet Services, Pet Boarding/Pet Sitting, Pets]",Chandler,4,Loving Hands Pet Care,[],-111.857818,AZ,5.0,33.356472,{},business
4,V28yjMqyZnbCtabroJN_aA,"1745 W Glendale Ave\nPhoenix, AZ 85021",{},True,"[Veterinarians, Pets]",Phoenix,3,Amec Mid-City Animal Hospital,[],-112.097232,AZ,5.0,33.538493,{},business
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15580,gfSNqTICrdeewK8XINGSaA,"1909 E Ray Rd\nChandler, AZ 85225","{'Tuesday': {'close': '14:30', 'open': '11:30'...",True,"[Sushi Bars, Japanese, Restaurants]",Chandler,83,Taiko,[],-111.809582,AZ,4.0,33.319907,"{'Take-out': True, 'Wi-Fi': 'no', 'Good For': ...",business
15581,mDrnZvSXMbMIGE0IX9mMqA,"115 N 6th St\nPhoenix, AZ 85004",{},True,"[Active Life, Landmarks & Historical Buildings...",Phoenix,18,Heritage Square,[],-112.065493,AZ,4.0,33.450206,{'Good for Kids': True},business
15582,Lk4WdkxGMZAE0EpYKExIGA,"2401 E Bell Rd\nPhoenix, AZ 85032",{},True,"[Food, Convenience Stores]",Phoenix,3,Glacier Water Services,[],-112.030416,AZ,3.5,33.640146,"{'Parking': {'garage': False, 'street': False,...",business
15583,cmmbUO5ghhSQzqeOkyfMnw,"13371 W Grand Ave Ste C-101\nSurprise, AZ 85374","{'Monday': {'close': '19:00', 'open': '11:00'}...",True,"[Shopping, Mobile Phones]",Surprise,5,Sprint,[],-112.341856,AZ,2.0,33.631805,"{'Parking': {'garage': False, 'street': False,...",business


In [22]:
biz_df['log_review_count'] = np.log(biz_df['review_count'] + 1)

In [23]:
biz_train, biz_validate = cross_validation.train_test_split(biz_df, test_size=0.2)

NameError: name 'cross_validation' is not defined

In [None]:
m1 = sklearn.linear_model.LinearRegression()
m1.fit(biz_train[['log_review_count']], biz_train['stars'])
m2 = sklearn.linear_model.LinearRegression()
m2.fit(biz_train[['review_count']], biz_train['stars'])

In [None]:
print("Residual sum of squares: %.5f"
      % np.mean((m1.predict(biz_validate[['log_review_count']]) - biz_validate['stars']) ** 2))

In [None]:
print("Residual sum of squares: %.5f"
      % np.mean((m2.predict(biz_validate[['review_count']]) - biz_validate['stars']) ** 2))

## Bike Sharing Dataset

In [None]:
bike_df = pd.read_csv('data/bike_sharing/train.csv')
bike_df

In [None]:
fig, ax = plt.subplots()
bike_df['count'].hist(ax=ax, bins=100)

In [None]:
from sklearn import linear_model
from sklearn import cross_validation

In [None]:
categorical_cols = ['season', 'weather']
bike_df = pd.get_dummies(bike_df, prefix=categorical_cols, columns=categorical_cols)

In [None]:
bike_df

In [None]:
bike_df['log_count'] = np.log(bike_df['count'])

In [None]:
features = ['holiday', 'workingday', 'temp', 'atemp', 'humidity', 'windspeed',
            'season_1', 'season_2', 'season_3', 'season_4',
            'weather_1', 'weather_2', 'weather_3', 'weather_4']
train, validate = cross_validation.train_test_split(bike_df, test_size=0.2)

In [None]:
validate.shape

In [None]:
train.shape

In [None]:
lin_model = linear_model.LinearRegression()
lin_model.fit(train[features], train['count'])
log_lin_model = linear_model.LinearRegression()
log_lin_model.fit(train[features], train['log_count'])

In [None]:
# The sum of squares of the log residual errors
print("Log residual sum of squares: %.2f"
      % np.mean((np.log(1+lin_model.predict(validate[features])) - np.log(1+validate['count'])) ** 2))

In [None]:
# The sum of squares of the log residual errors
print("Log residual sum of squares: %.2f"
      % np.mean((np.log(1+np.exp(log_lin_model.predict(validate[features]))) - np.log(1+validate['count'])) ** 2))

## Kaggle loan data

In [None]:
loan_df = pd.read_csv('data/Kaggle loan/loan.csv')

In [None]:
loan_df.ix[:, range(20)]

In [None]:
loan_df.ix[:, range(20,40)]

In [None]:
loan_df.ix[:, range(40,60)]

In [None]:
loan_df.ix[:, range(60, 74)]

In [None]:
plt.figure()
loan_df['loan_amnt'].hist(bins=100)