# Baseline

In [1]:
# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
#from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [2]:
train_df = pd.read_json("../input/train.json")
test_df = pd.read_json("../input/test.json")

In [14]:
train_df.head()
#test_df.head()


Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [50]:
#train_df.describe()
#test_df.describe()
#train_df['created'].describe()
train_df['created2'] = train_df['created'].str[0:10].astype("datetime64");
train_df["created3"] = train_df['created'].astype("datetime64");

In [54]:
#http://stackoverflow.com/questions/27365467/python-pandas-plot-histogram-of-dates
print(train_df["created2"].groupby([train_df["created2"].dt.year, train_df["created2"].dt.month]).count())

print(train_df["created3"].groupby([train_df["created3"].dt.dayofweek]).count())
print(train_df["created3"].groupby([train_df["created3"].dt.hour]).count())

created2  created2
2016      4           16411
          5           15797
          6           17144
Name: created2, dtype: int64
created3
0    4239
1    8417
2    9414
3    8195
4    7593
5    6953
6    4541
Name: created3, dtype: int64
created3
0        55
1      5749
2     10596
3      8318
4      5021
5      7954
6      4446
7      1047
8       336
9       135
10      284
11      444
12      693
13      616
14      782
15      753
16      370
17      443
18      450
19      266
20      191
21      178
22      139
23       86
Name: created3, dtype: int64


In [11]:
#train_df.shape
test_df.shape

(74659, 14)

In [54]:
#use description as the training data... just to start
train_data = train_df['description'].values
train_labels = train_df['interest_level'].values
test_data = test_df['description'].values
listing_ids = test_df['listing_id'].values

In [55]:
#Preprocessor
def pre_proc(s,
              word_length_range=(3,7),
              remove_stop_words=True,
              scale_capitals=1,
              set_to_lower=True,
              remove_numbers=False
             ):
   
    s2 = re.sub(ur"\p{P}+","",s) #strip punctuation
    s2 = re.sub(ur"[^\w ]+"," ",s2) #remove punctuation2
    s2 = re.sub(ur"\_","",s2) #remove underscores (ignored by w)
    
    #http://stackoverflow.com/questions/8745821/find-words-with-capital-letters-not-at-start-of-a-sentence-with-regex
    #doesn't matter if at start of sentence, often it's the key NP. If a stopword, those get stripped anyway
    names = " "+" ".join(re.findall(ur'\b[A-Z][A-Za-z0-9]*\b',s2))
    for i in range(0,scale_capitals):
        s2 = s2 + names
        
    if set_to_lower:
        s2 = s2.lower() #lower case

    s2 = re.sub(ur"\s+", " ",s2) #remove mult spaces (avoids cases with double spaces for look behind)
    
    if remove_numbers:
        s2 = re.sub(ur"\d", " ",s2) #remove all numbers

    truncation_re = ur"(?<=(\s\w{"+ur"{}".format(word_length_range[1])+ur"}))(\w*\s)"
    s2 = re.sub(truncation_re,"\1 ",s2) #truncate words > n char

    short_elim_re = ur"\b\w{1,"+ur"{}".format(word_length_range[0])+ur"}\b"
    s2 = re.sub(short_elim_re, "", s2) #removes all words/numbers < n in length
    
    #remomve stop words
    if remove_stop_words:
        s2_split = s2.split()
        s3_split = s2.split()
        for key in s2_split:
            if key.lower in stop_words:
                s3_split.remove(key)
        s2 =' '.join(s3_split)
    return s2



In [57]:
pre_proc_custom = lambda x: pre_proc(x, 
                                      word_length_range = (3,7), 
                                      remove_stop_words = False, 
                                      scale_capitals = 1, 
                                      set_to_lower = True,
                                      remove_numbers = False
                                     )

mytv = TfidfVectorizer(ngram_range=(1,1), 
                       analyzer='word', 
                       preprocessor=pre_proc_custom)
X_train = mytv.fit_transform(train_data)
train_words = mytv.get_feature_names()

mytv_test = TfidfVectorizer(ngram_range=(1,1), 
                           analyzer='word', 
                           preprocessor=pre_proc_custom, 
                           vocabulary=train_words) 
X_test= mytv_test.fit_transform(test_data) 

mnb = MultinomialNB(alpha = 0.009)
mnb.fit(X_train, train_labels)
print(mnb.score(X_train, train_labels))

0.766108769655


In [70]:
labels = mnb.classes_
print(labels)
predictions = mnb.predict_proba(X_test)
print(predictions)
indexes = listing_ids
print(indexes)


print(indexes[0])
my_df = pd.DataFrame(data=predictions, index=indexes, columns=labels)  
my_df.index.names = ['listing_id']


cols = my_df.columns.tolist()
print(cols)
cols = [cols[0], cols[2], cols[1]]
print(cols)

print(my_df[cols])

[u'high' u'low' u'medium']
[[  1.60099714e-01   3.41130106e-01   4.98770180e-01]
 [  6.10717785e-02   6.98645956e-01   2.40282266e-01]
 [  2.42391744e-02   6.35857398e-01   3.39903428e-01]
 ..., 
 [  2.13109434e-02   7.83457223e-01   1.95231833e-01]
 [  1.05865747e-01   6.23553063e-01   2.70581190e-01]
 [  6.22065182e-04   8.86063021e-01   1.13314914e-01]]
[7142618 7210040 7103890 ..., 6882352 6884758 6924212]
7142618
[u'high', u'low', u'medium']
[u'high', u'medium', u'low']
                high    medium       low
listing_id                              
7142618     0.160100  0.498770  0.341130
7210040     0.061072  0.240282  0.698646
7103890     0.024239  0.339903  0.635857
7143442     0.016602  0.077800  0.905598
6860601     0.021977  0.277620  0.700403
6840081     0.012090  0.040225  0.947685
6922337     0.051292  0.261592  0.687115
6913616     0.124326  0.244808  0.630866
6937820     0.099575  0.281191  0.619234
6893933     0.160779  0.029315  0.809905
6832604     0.077788  0.2275

In [71]:
my_df[cols].to_csv("sumbmission001.csv", index=True)