In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("vehicles.csv")

In [3]:
df.head()

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


In [4]:
df.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
       'image_url', 'description', 'county', 'state', 'lat', 'long',
       'posting_date'],
      dtype='object')

In [5]:
df.dtypes

id                int64
url              object
region           object
region_url       object
price             int64
year            float64
manufacturer     object
model            object
condition        object
cylinders        object
fuel             object
odometer        float64
title_status     object
transmission     object
VIN              object
drive            object
size             object
type             object
paint_color      object
image_url        object
description      object
county          float64
state            object
lat             float64
long            float64
posting_date     object
dtype: object

In [6]:
# Select columns for analysis
df_test = df.loc[:,['price','manufacturer','cylinders','size']]

In [7]:
df_test

Unnamed: 0,price,manufacturer,cylinders,size
0,6000,,,
1,11900,,,
2,21000,,,
3,1500,,,
4,4900,,,
...,...,...,...,...
426875,23590,nissan,6 cylinders,
426876,30590,volvo,,
426877,34990,cadillac,,
426878,28990,lexus,6 cylinders,


In [9]:
# Remove rows with NA data
df_drop = df_test.dropna()

In [10]:
# Look at value counts
manufacturer_count = df_drop.manufacturer.value_counts()
manufacturer_count

ford               19390
chevrolet          15386
toyota              9202
honda               6339
nissan              5054
jeep                4178
gmc                 3968
ram                 3844
bmw                 3672
mercedes-benz       3499
dodge               3267
subaru              2371
hyundai             2348
volkswagen          2181
lexus               1912
kia                 1806
chrysler            1622
cadillac            1577
audi                1314
buick               1309
mazda               1235
acura                946
lincoln              921
infiniti             861
pontiac              812
volvo                715
mitsubishi           711
mini                 607
mercury              491
rover                485
saturn               381
porsche              344
jaguar               292
fiat                 158
alfa-romeo            71
tesla                 51
harley-davidson       43
ferrari               40
datsun                17
land rover             8


In [11]:
cylinders_count = df_drop.cylinders.value_counts()
cylinders_count

6 cylinders     35171
4 cylinders     35134
8 cylinders     30699
10 cylinders      886
5 cylinders       825
other             432
3 cylinders       233
12 cylinders       53
Name: cylinders, dtype: int64

In [11]:
size_count = df_drop.size.value_counts()
size_count

AttributeError: 'numpy.int32' object has no attribute 'value_counts'

In [12]:
# Determine which values to replace
replace_cy = list(cylinders_count[cylinders_count < 30000].index)

# Replace in DataFrame
for cy in replace_cy:
    df_drop.cylinders = df_drop.cylinders.replace(cy,"Other")

# Check to make sure binning was successful
df_drop.cylinders.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


6 cylinders    35171
4 cylinders    35134
8 cylinders    30699
Other           2429
Name: cylinders, dtype: int64

In [13]:
# Select ford, chevrolet, toyota, and honda
df_select1 = df_drop[df_drop['manufacturer'].isin(['ford','chevrolet','toyota','honda'])]

In [14]:
df_select1

Unnamed: 0,price,manufacturer,cylinders,size
31,15000,ford,6 cylinders,full-size
55,19900,ford,8 cylinders,full-size
59,14000,honda,6 cylinders,full-size
65,22500,ford,8 cylinders,full-size
119,17500,toyota,6 cylinders,full-size
...,...,...,...,...
426785,23495,ford,8 cylinders,full-size
426788,12995,chevrolet,4 cylinders,compact
426792,32999,ford,8 cylinders,full-size
426793,15999,chevrolet,4 cylinders,mid-size


In [15]:
# Create features and target
X = df_select1.copy().drop('price',axis=1)
y = df_select1['price']

In [16]:
X = pd.get_dummies(X, columns=['manufacturer','cylinders','size'])

In [17]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.metrics import classification_report_imbalanced

In [18]:
# Create training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [19]:
X

Unnamed: 0,manufacturer_chevrolet,manufacturer_ford,manufacturer_honda,manufacturer_toyota,cylinders_4 cylinders,cylinders_6 cylinders,cylinders_8 cylinders,cylinders_Other,size_compact,size_full-size,size_mid-size,size_sub-compact
31,0,1,0,0,0,1,0,0,0,1,0,0
55,0,1,0,0,0,0,1,0,0,1,0,0
59,0,0,1,0,0,1,0,0,0,1,0,0
65,0,1,0,0,0,0,1,0,0,1,0,0
119,0,0,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
426785,0,1,0,0,0,0,1,0,0,1,0,0
426788,1,0,0,0,1,0,0,0,1,0,0,0
426792,0,1,0,0,0,0,1,0,0,1,0,0
426793,1,0,0,0,1,0,0,0,0,0,1,0


In [20]:
from sklearn import linear_model

In [21]:
# Create model
regr = linear_model.LinearRegression()

In [22]:
# Fit model
regr.fit(X_train, y_train)

LinearRegression()

In [23]:
# Make prediction
pred = regr.predict(X_test)

In [24]:
# Calculate confusion matrix
cm = confusion_matrix(y_test, pred)

In [25]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [28]:
# Create prediction DataFrame
data = {'price': y_test,
       'prediction': pred}
pred_df = pd.DataFrame(data)

In [29]:
pred_df

Unnamed: 0,price,prediction
232402,0,-51072.0
339798,30985,347520.0
270618,5500,-51072.0
420141,28994,-51072.0
98308,6995,634496.0
...,...,...
105783,20992,553088.0
147561,10950,81664.0
271883,35000,357632.0
359853,2795,86400.0
