In [17]:
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from datetime import date, datetime, time, timedelta
from scipy import stats 

sns.set()
%matplotlib inline

'''$$$$$$$$$$$$$$$$$$$$ Start EDA $$$$$$$$$$$$$$$$$$$$'''
def general_explore_file(filepath, show = False):
    df = pd.read_csv(filepath)
    if show:
        explore_df(df)
    return df

def get_prep_data(filepath, parse_dt = ['Opened', 'Closed'], fix_index = 'CaseID'):
    df = pd.read_csv(filepath, parse_dates =parse_dt, infer_datetime_format=True)
    if fix_index:
        df.set_index(fix_index, inplace = True)
    return df
    
def read_csv_chunks_into_df(file_path, chunk_size, parse_date=False):    
    if parse_date:
        chunks = pd.read_csv(file_path, parse_dates= ['Opened','Closed','Updated'], \
                             infer_datetime_format=True, chunksize = chunk_size )
    else:
        chunks = pd.read_csv(file_path, chunksize = chunk_size )
    df = pd.concat(chunks) 
    return df

def explore_df(df):
    print '*********** Shape of df **************'
    print df.shape
    features = df.columns.values    
    print '********** Number of features ***************'
    print len(features) 
    print '********** Features ***************'
    print features
    print '******* Head of df ******************'
    print df.head()
    print '******* Info of df ******************'
    print df.info()
    print '******** Description of df *****************'
    print df.describe()
    return

def import_data(folder, filename):
    '''import the data and set the right datetime '''
    folder = folder
    filename_original = filename
    filepath = folder + filename_original
    df = general_explore_file(filepath)

    df = df.sort_values('CaseID')
    df.set_index('CaseID', inplace = True)

    ''' parse dates'''
    dt_list = ['Opened','Closed','Updated'] # list of datetime columns
    time_format = '%m/%d/%Y %I:%M:%S %p'
    dft = parsedate(df, dt_list, time_format) # parse ['Opened','Closed','Updated'] to timedate 
    return dft
    
def get_unique(df):
    features = df.columns.values
    print '********* Number of unique values **********'
    for feature in features:
        print feature,' ', len(df[feature].unique())
    return 

def get_missing(df):
    print '********* Number of missing values **********'
    df2 = df.isnull()
    features = df.columns.values
    for feature in features:
        temp = df[df2[feature]]
        print feature,' ', len(temp)

def drop_na_row(df,feature):
    df = df.ix[df[feature].notnull(), :]
    return df

def get_value_counts(df, feature_list):
    for feature in feature_list:
        print '************ '+feature+' value counts ***********'
        print df[feature].value_counts(dropna = False)
    return

def parsedate(df, columns, time_format):
    for column in columns:
        df[column] = pd.to_datetime(df[column], format = time_format)
    return df

def days_to_minutes(dt):
    return  dt.total_seconds()//60#(td.seconds//60)%60

def days_to_hours(dt):
    hours = dt.total_seconds()/3600#(td.seconds//60)%60
    return np.round(hours,1)

def get_sorted_category_value(df, category):
    ''' returns sorted categorical values based on the mean process_hours '''
    df1 = df.copy()
    dfm = df1.groupby(category).mean()
    dfm = dfm.sort_values('Process_hours')
    return dfm.index

def category_to_numer_dict(df, category, values):
    '''
    Change a categorical column to numeric and save the categorical values in a 
    dictionary for later reference values is sorted list of categorical values
    '''
    dict = defaultdict(str)
    for i,value in enumerate(values):
        dict[i] = value # store the categorical values in a dictionary for reference
        df.ix[df[category]==value, category] = i
    df[category].astype(int, inplace=True)
    return dict

def category_to_numer_basic(df, category):
    '''Change a categorical column to numeric and save the categorical 
    values in a dictionary for later reference (basic version)'''
    values = df[category].unique()
    for i,value in enumerator(values):
        df.ix[df[category]==value, category] = i
    df[category].astype(int, inplace=True)
    return df

def batch_process_categories(df, categories):
    '''convert categorical features to numerical by batch, 
    return a dictionary of dictionaries storing the mapping of categorical value to number'''
    cate_dict = {}
    for category in categories:
        '''Convert the categoricl column to numerical'''
        if  category in df.columns.values:
            cate_val = get_sorted_category_value(df,category) 
            '''The category_to_numer_dict() modify the input dataframe by side-effect and return a dictionary'''
            cate_dict[category] = category_to_numer_dict(df, category, cate_val)
    return cate_dict

def check_group_mean(df, groupby_cols, target_cols):
    for col in groupby_cols:
        dfm = df.groupby(col).mean()
        print dfm[target_cols]
    return

def check_group_stats(df, groupby_cols, target_cols):
    for col in groupby_cols:
        dfm = df.groupby(col).describe()
        print dfm[target_cols]
    return
'''$$$$$$$$$$$$$$$$$$$$ Finishing EDA $$$$$$$$$$$$$$$$$$$$'''

'''$$$$$$$$$$$$$$$$$$$$ End importing data $$$$$$$$$$$$$$$$$$$$'''
def clean_data(dft):
    '''remove and save the cases that are not closed'''
    dft_still_open = dft[dft['Closed'].isnull()] # cases that not closed
    filename_open = 'SF311_still_open_raw.csv'
    dft_still_open_csv_path = folder + filename_open
    dft_still_open.to_csv(dft_still_open_csv_path) # dft_still_open.csv contains cases that not closed

    '''calculate the process time '''
    condition = dft['Closed'].notnull()
    dft_closed = dft[condition] # cases that are closed 
    dft_closed['Process_days'] = dft_closed['Closed'] - dft_closed['Opened']
    dft_closed['Process_hours'] = dft_closed['Process_days'].apply(days_to_hours)

    ''' remove cases with process time <= 0 hours and save theses cases '''
    dft_wrong_dates = dft_closed[dft_closed['Process_hours'] <= 0]
    filename_wrong_dates = 'SF311_wrong_dates_raw.csv'
    dft_wrong_dates_csv_path = folder + filename_wrong_dates
    dft_wrong_dates.to_csv(dft_wrong_dates_csv_path) 
    # dft_wrong_dates_raw.csv contains cases that have wrong dates: closed before opened'''re
    dft_right_dates = dft_closed[dft_closed['Process_hours'] > 0]

    ''' remove duplicated cases and save theses cases '''
    dft_duplicates, dft_valid = check_word_in_col(dft_right_dates, 'Status Notes', 'Duplicate')
    filename_duplicates = 'SF311_duplicates_raw.csv'
    dft_duplicates_csv_path = folder + filename_duplicates
    dft_duplicates.to_csv(dft_duplicates_csv_path) # dft_duplicates_raw.csv contains cases that are duplicated
    
    '''convert Process_days to float'''
    dft_valid['Process_days'] = dft_valid['Process_hours']/24.0
    
    '''save raw valid cases'''
    filename_valid = 'SF311_valid_raw.csv'
    dft_valid_csv_path = folder + filename_valid
    dft_valid.to_csv(dft_valid_csv_path)
    
    '''remove unnecessary columns and save the cases to csv file'''
    drop_col = ['Updated','Status', 'Media URL']
    dft_valid_reduced = dft_valid.drop(drop_col, axis =1)
    filename_reduced = 'SF311_valid_reduced.csv'
    dft_valid_reduced_csv_path = folder + filename_reduced
    dft_valid_reduced.to_csv(dft_valid_reduced_csv_path)
    
    print 'Number of original cases: ', len(dft)
    print 'Cases that are not closed: ', len(dft_still_open)
    print 'Cases with process time <= 0: ', len(dft_wrong_dates)
    print 'Cases with process time > 0: ', len(dft_right_dates)
    print 'Duplicated cases: ', len(dft_duplicates)
    print 'Valid cases: ', len(dft_valid)
    return dft_valid_reduced


def plot_data_on_date(df, data_col, year = False, month = False, day = False, dot = True):
    ''' set index to date and plot df column data against the index, year can be Boolean or int'''
    dfcp = df.copy()
    dfcp.set_index('Opened', inplace = True)
    if (type(year)==int) & (type(month)==int) & (type(day)==int):
        cond1 = dfcp.index.year == year
        cond2 = dfcp.index.month == month
        cond3 = dfcp.index.day == day
        dfcp1 = dfcp[cond1 & cond2 & cond3]           
    elif (type(year)==int) & (type(month)==int):
        cond1 = dfcp.index.year == year
        cond2 = dfcp.index.month == month
        dfcp1 = dfcp[cond1 & cond2]        
    elif type(year)==int:
        dfcp1 = dfcp[dfcp.index.year == year]
    else:
        dfcp1 = dfcp
    if dot:     
        dfcp1[data_col].plot(figsize=(18,16), c='m', alpha = 0.2,style='o')
    else:
        dfcp1[data_col].plot(figsize=(18,16), c='k', alpha = 0.2)
    plt.show()
    return

'''This function is not necessary, becasue plot_data_on_date() can do scatter plot'''
'''plot a scatter plot on data'''
def scatter_data_on_date(df, data_col, year = False, month = False, day = False):
    ''' set index to date and plot df column data against the index; year, month, and day can be Boolean or int'''
    dfcp = df.copy()
    dfcp['Opened_Int'] = dfcp['Opened'].astype(np.int64)
    dfcp.set_index('Opened', inplace = True)
    if (type(year)==int) & (type(month)==int) & (type(day)==int):
        cond1 = dfcp.index.year == year
        cond2 = dfcp.index.month == month
        cond3 = dfcp.index.day == day
        dfcp1 = dfcp[cond1 & cond2 & cond3]                   
    elif (type(year)==int) & (type(month)==int):
        cond1 = dfcp.index.year == year
        cond2 = dfcp.index.month == month
        dfcp1 = dfcp[cond1 & cond2]        
    elif type(year)==int:
        dfcp1 = dfcp[dfcp.index.year == year]        
    else:
        dfcp1 = dfcp
    dfcp1.plot(kind = 'scatter', x='Opened_Int', y='Process_days', alpha = 0.2, c = 'm', figsize=(20,10))  
    return


'''######## this can be a short lambda expression########'''
def get_str_list(string):
    '''convert to a list of string'''
    return str(string).split()

'''######## this can be a short lambda expression########'''
def check_dup(str_list):
    '''check if word 'Duplicate' is in the string list'''
    return 'Duplicate' in str_list

def check_word_in_col(df, column, word):
    '''check if a word in the column, returns a tuple of dataframes, 
    the first one contains the word and second one does not''' 
    df1 = df.copy()
    get_str_list = lambda x: str(x).split()
    check_dup = lambda x: word in x

    df1[column+'1'] = df1[column].apply(get_str_list) # turn df1[column] into a list of strings
    cond = df1[column+'1'].apply(check_dup) # check if df1[column+'1'] contains the word
    df_found = df[cond]
    df_not_found = df[~cond]
    return df_found, df_not_found

def add_features(df):
    df1 = df.copy()
    '''Add features: Day of week, Month, Year, Weekend '''
    df1['Day_Of_Week'] = df1['Opened'].dt.dayofweek
    df1['Month'] = df1['Opened'].dt.month
    df1['Year'] = df1['Opened'].dt.year
    
    df1['Weekend'] = (df1['Day_Of_Week'].isin((5,6))).astype(int) # if the open day is at weekend
    '''Add feature Holiday and Before_Holiday'''
    df1['Opened_Int'] = df1['Opened'].astype(np.int64)
    
    cal = calendar()
    holidays = cal.holidays()
    df1['Holiday'] = ((df1['Opened'].dt.date).astype('datetime64').isin(holidays)).astype(int)
    df1['Before_Holiday'] = (((df1['Opened'].dt.date).astype('datetime64')+timedelta(days = 1))\
                             .isin(holidays)).astype(int)
    num_of_holiday = len(df1[df1['Holiday'] == 1])
    num_of_before_holiday = len(df1[df1['Before_Holiday'] == 1])
    return df1

def get_oneway_anova(df, target_col, group_col, group_list=False):
    if group_list:
        groups = group_list
    else:
        groups = list(df[group_col].unique())
    datasets =[]
    for component in groups:
        df_temp = df[df[group_col] == component]
        datasets.append(np.array(df_temp[target_col]))
    f_val, p_val = stats.f_oneway(*datasets)  
    print "One-way ANOVA P =", p_val  
    return p_val

def add_current_open(df):
    '''add column Current_Open which has the number of current open cases'''
    def get_open_cases(opt):
        condition1 = df['Opened'] < opt
        condition2 = df['Closed'] > opt
        open_cases = df[condition1 & condition2]
        return len(open_cases)
    df['Current_Open'] = df['Opened'].apply(get_open_cases)
    return df

def create_pilot(df, folder, filename):    
    '''Create a pilot dataset of most recent 100000 cases for preliminary 
    modeling and feature engineering, save as SF311_pilot.csv '''
    pilot = df.iloc[:100000,:]
    pilot.to_csv(folder+filename_pilot)
    return

def impute_neighbor_knn():
    pass

def creat_data_chunk(df, folder, filename, days = 730):
    if timedelta(days = days) > df['Opened'].max()-df['Opened'].min():
        print 'Error! Days over the limit!'
        return
    new_end_date = df['Opened'].max() - timedelta(days = days) # get the date that is 2 year before the data collecting date
    condition1 = df['Opened'] <= new_end_date
    condition2 = df['Process_days'] <= days
    df_chunk = df[condition1 & condition2]
    df_chunk.to_csv(folder+filename)
    return

def train_test_df_split(df, test_size = 0.2, random_seed = 111):
    np.random.seed(seed = random_seed)
    df['Flag'] = np.random.random(size = len(df)) <= test_size
    df_train = df[~df['Flag']]
    df_test = df[df['Flag']]
    df_train.drop('Flag', axis=1, inplace = True)
    df_test.drop('Flag', axis=1, inplace = True)
    return df_train, df_test

NLP precess

1. get df from training dataset

2. split it to train and validation sets

3. use train set fit_transform tfidf vectorizer, save the matrix of tfidfed and the tfidf model in pickle

4. use tfidfed to do kmeans lcustering and get the lable for each sample and save the centroids in pickle

5. use that lable as request topic for training the model

6. for validation set, first calculate the similarity of each sample's request type to the

In [None]:
'''
*****************************************************************
From here: can be skipped because the valid cases have been stored in csv file
*****************************************************************
'''
folder = '/Users/haowei/Documents/GN/Capstone/Capstone-project/data/'
filename_original = 'SF311.csv'
dft = import_data(folder, filename_original)
dft_valid_reduced = clean_data(dft)

In [3]:
'''Read data from new cleaned and reduced csv file'''
folder = '/Users/haowei/Documents/GN/Capstone/Capstone-project/data/'
filename_reduced = 'SF311_valid_reduced.csv'

cfdf = get_prep_data(folder+filename_reduced)
cfdf_cp = cfdf.copy()
print 'reduced dataframe shape: ', cfdf.shape

cfdf = add_features(cfdf)

reduced dataframe shape:  (1935020, 14)


In [24]:
'''
*****************************************************************
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
*****************************************************************
'''
'''Read data from new engineered csv file'''
folder = '/Users/haowei/Documents/GN/Capstone/Capstone-project/data/'
filename_engineered = 'SF311_engineered.csv'

df = get_prep_data(folder+filename_engineered)
df_cp = df.copy()

# print 'Engineered dataframe info: ', df.info()
# df.head(2)

In [3]:
df['Opened_Int'] = df['Opened'].astype(float)
filename_engineered = 'SF311_engineered.csv'
df.to_csv(folder+filename_engineered)



In [48]:
'''Read data from new filled csv file'''
folder = '/Users/haowei/Documents/GN/Capstone/Capstone-project/data/'
filename_fill = 'SF311_fill.csv'

df_fill = get_prep_data(folder+filename_fill)


In [57]:
'''Will use KNN to impute the neighborhood, but before doing it need to split the data into train-valid and test set'''
'''Because KNN will use information from the whole dataset, lead to a data leakage'''
'''Do a 80-20% train-test split on dataframe'''
np.random.seed(seed = 111)
df_fill['Flag'] = np.random.random(size = len(df_fill)) >=0.8
df_train = df[~df['Flag']]
df_test = df[df['Flag']]
print len(df_train), len(df_test), len(df_test)*1./len(df), len(df_train)+ len(df_test)

1548369 386651 0.199817572945 1935020


In [58]:
'''write the train and test datasets to csv'''
folder = '/Users/haowei/Documents/GN/Capstone/Capstone-project/data/'
filename_train = 'SF311_train.csv'
filename_test = 'SF311_test.csv'
df_train.to_csv(folder+filename_train)
df_test.to_csv(folder+filename_test)

In [3]:
'''Read data from new train and test csv file'''
folder = '/Users/haowei/Documents/GN/Capstone/Capstone-project/data/'
filename_train = 'SF311_train.csv'
filename_test = 'SF311_test.csv'

df_tr = get_prep_data(folder+filename_train)
# df_te = get_prep_data(folder+filename_test)
# print len(df_tr), len(df_te)
df_tr.head(2)

Unnamed: 0_level_0,Opened,Closed,Status Notes,Responsible Agency,Category,Request Type,Request Details,Address,Supervisor District,Neighborhood,...,Process_days,Process_hours,Day_Of_Week,Month,Year,Weekend,Holiday,Before_Holiday,Opened_Int,Flag
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
185580,2008-03-20 11:48:32,2013-03-26 18:13:25,Case Completed - resolved:,DPW Ops Queue,Tree Maintenance,Trees - Damaging_Property,Lifted_sidewalk_tree_roots,Intersection of FILLMORE ST and TURK ST,5.0,Western Addition,...,1832.266667,43974.4,3,3,2008,0,0,0,1206013712000000000,False
196699,2008-04-13 14:04:28,2015-06-11 12:00:00,Case Completed - resolved: Request closed by ...,DPW Ops Queue,Tree Maintenance,Trees - Damaging_Property,Lifted_sidewalk_tree_roots,"479 GOLD MINE DR, SAN FRANCISCO, CA, 94131",8.0,Diamond Heights,...,2614.9125,62757.9,6,4,2008,1,0,0,1208095468000000000,False


In [3]:
'''Deal with request type using tf-idf then clustering'''
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from PremodelingProcess import train_vali_split, get_df_for_modeling, \
dump_object_to_pickle, get_df_for_engineer, process_data_for_survival_model, train_test_df_split


'''Check if there is pickle files of dataframe ready for load'''
filename_train_pickle = '../data/SF311_train.pickle'
filename_train = '../data/SF311_train.csv'
df = get_df_for_engineer(filename_train_pickle, filename_train)
print 'dataframe shape: ', df.shape
# print df.head()

#df['Request Type'] = df['Request Type'].apply(lambda x: str(x).lower())
df1 = df[:1000]#run a pilot
df1 = df.copy()#run a pilot
df_tra, df_val = train_test_df_split(df1, test_size = 0.2, random_seed = 222)
series_tra = df_tra['Request Topic'].apply(lambda x: str(x).lower())
series_val = df_val['Request Topic'].apply(lambda x: str(x).lower())
documents_train = list(series_tra)
documents_validation = list(series_val)
print documents_train[:5]

get df from pickle
dataframe shape:  (1547521, 22)
['trees - damaging_property', 'trees - damaging_property', 'trees - damaging_property', 'illegal_dumping', 'illegal_dumping']


In [4]:

# Tokenize and remove stop words

# 1. Create a set of documents.
#documents = [' '.join(article['content']).lower() for article in coll.find()]

# 2. Create a set of tokenized documents. No need to tokenize because there is no punctuation in the phrase
docs = [word_tokenize(content) for content in documents_train]
print docs[:15]

# # 3. Strip out stop words from each tokenized document.
# stop = set(stopwords.words('english'))
# docs = [[word for word in words if word not in stop] for words in docs]

# Stemming / Lemmatization

# 1. Stem using both stemmers and the lemmatizer
porter = PorterStemmer()

# snowball = SnowballStemmer('english')
# wordnet = WordNetLemmatizer()
docs_porter = [[porter.stem(word) for word in words] for words in docs]

# docs_snowball = [[snowball.stem(word) for word in words] for words in docs]
# docs_wordnet = [[wordnet.lemmatize(word) for word in words] for words in docs]

# print docs_porter[:30]
# print '*************'
new_docs = [' '.join(doc) for doc in docs_porter]
# print new_docs[:30]
#3. Create word count vector over the whole corpus.
cv = CountVectorizer(stop_words='english')
vectorized = cv.fit_transform(new_docs)


[['trees', '-', 'damaging_property'], ['trees', '-', 'damaging_property'], ['trees', '-', 'damaging_property'], ['illegal_dumping'], ['illegal_dumping'], ['graffiti'], ['not_offensive', 'graffiti', 'on', 'public', 'property'], ['overflowing_city_receptacle_or_dumpster'], ['sidewalk_cleaning'], ['illegal_dumping'], ['illegal_dumping'], ['sidewalk_cleaning'], ['illegal_dumping'], ['not_offensive', 'graffiti', 'on', 'public', 'property'], ['abandoned', 'vehicle', '-', 'pickuptruck']]


In [5]:
'''Make tfidf model and tfidfed matrix'''
tfidf = TfidfVectorizer(stop_words='english')
tfidfed = tfidf.fit_transform(documents_train)

#print tfidfed
'''save ftidf model to pickle file, will be used to transform the text in test file'''
filename_tfidf_pickle = '../data/SF311_tfidf.pickle'
filename_tfidfed_pickle = '../data/SF311_tfidfed.pickle'
dump_object_to_pickle(tfidf,filename_tfidf_pickle)
dump_object_to_pickle(tfidfed,filename_tfidfed_pickle)
print 'done tfidf'

done tfidf


In [6]:
dense = tfidfed.todense()
dense.shape


(807, 73)

In [10]:
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=100, n_jobs=-2)
k_means.fit(tfidfed) 
print len(k_means.labels_)
#print k_means.cluster_centers_[:5]
print type(k_means.cluster_centers_)
centers = k_means.cluster_centers_

807
<type 'numpy.ndarray'>


In [15]:
df_tra['kmeans'] = k_means.labels_
df_tra.to_csv('../data/SF311_df_train_kmeans.csv')
#df_tra.head(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [14]:
df_tra.head(1)

Unnamed: 0_level_0,Opened,Closed,Status Notes,Responsible Agency,Category,Request Details,Address,Supervisor District,Neighborhood,Point,...,Day_Of_Week,Month,Year,Weekend,Holiday,Before_Holiday,Opened_Int,Request Topic,Open Time,kmeans
CaseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
185580,2008-03-20 11:48:32,2013-03-26 18:13:25,Case Completed - resolved:,DPW Ops Queue,Tree Maintenance,Lifted_sidewalk_tree_roots,Intersection of FILLMORE ST and TURK ST,5.0,Western Addition,"(37.7804961587438, -122.432140368666)",...,3,3,2008,0,0,0,1206013712000000000,Trees - Damaging_Property,120.601371,17


In [12]:

# def predict_kmeans_label(tfidfed, cluster_centers ):
#     labels=[]
#     for row in tfidfed:
#         for center in cluster_centers:
#             min_dis = 
print documents_validation[:5]
valid_tfidfed = tfidf.transform(documents_validation)
valid_dense = valid_tfidfed.todense()
valid_dense.shape      

['sidewalk_cleaning', 'sign repair - bent', 'sidewalk_cleaning', 'illegal_dumping', 'sfha priority - emergency']


(193, 73)

In [18]:
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(valid_dense, centers)
cosine_similarities.shape
type(cosine_similarities)
validation_labels = np.argmax(cosine_similarities, axis =1)
print validation_labels.shape
print validation_labels[:10]
cosine_similarities[:10]
df_val['kmeans'] = validation_labels
print df_val.head(1)
df_val.to_csv('../data/SF311_df_test_kmeans.csv')

(193,)
[ 1 65  1  0 13  1 13  3  0  0]
                    Opened              Closed Status Notes  \
CaseID                                                        
240222 2008-07-01 03:03:00 2008-07-01 06:10:00          NaN   

       Responsible Agency                      Category Request Details  \
CaseID                                                                    
240222      DPW Ops Queue  Street and Sidewalk Cleaning         Garbage   

                                        Address  Supervisor District  \
CaseID                                                                 
240222  Intersection of BROOK ST and MISSION ST                  9.0   

          Neighborhood                                  Point   ...    \
CaseID                                                          ...     
240222  Bernal Heights  (37.7406417656081, -122.423090064246)   ...     

       Day_Of_Week  Month  Year  Weekend  Holiday  Before_Holiday  \
CaseID                                 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
df_train_kmeans = pd.read_csv('../data/SF311_df_train_kmeans.csv')
df_test_kmeans = pd.read_csv('../data/SF311_df_test_kmeans.csv')
print df_train_kmeans.shape
print df_train_kmeans.info()

(807, 24)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 807 entries, 0 to 806
Data columns (total 24 columns):
CaseID                 807 non-null int64
Opened                 807 non-null object
Closed                 807 non-null object
Status Notes           3 non-null object
Responsible Agency     807 non-null object
Category               807 non-null object
Request Details        807 non-null object
Address                807 non-null object
Supervisor District    807 non-null float64
Neighborhood           807 non-null object
Point                  807 non-null object
Source                 807 non-null object
Process_days           807 non-null float64
Process_hours          807 non-null float64
Day_Of_Week            807 non-null int64
Month                  807 non-null int64
Year                   807 non-null int64
Weekend                807 non-null int64
Holiday                807 non-null int64
Before_Holiday         807 non-null int64
Opened_Int             807 non-