In [1]:
import pandas as pd
import xgboost as xgb


In [2]:
def process_data(data):
    data['Dates'] = pd.to_datetime(data['Dates'])     # 转化为实践序列
    
    # 提取年、月、日、星期几、小时、时刻    作为新创建的特征
    data['year'] = data['Dates'].dt.year
    data['month'] = data['Dates'].dt.month
    data['day'] = data['Dates'].dt.day
    data['wday'] = data['Dates'].dt.dayofweek
    data['hour'] = data['Dates'].dt.hour + data['Dates'].dt.minute/60
    data['qtr'] = data['Dates'].dt.quarter
    
    
    # one-hot-code
    dummy_dayofweef = pd.get_dummies(data['DayOfWeek'],prefix='wday')
    data = data.join(dummy_dayofweef)
    
    # 新增特征，春夏秋冬
    def season(x):
        spring = 0
        summer = 0
        fall = 0
        winter = 0
        if x in [2,3,4]:
            spring = 1
        if x in [5,6,7]:
            summer = 1
        if x in [8,9,10]:
            fall = 1
        if x in [11,12,1]:
            winter = 1
        return spring,summer,fall,winter
    data['spring'],data['summer'],data['fall'],data['winter'] = zip(*data['hour'].apply(season))
    
    # 新增特征，凌晨、上午、下午、晚上
    def time(x):
        emorning = 0
        morning = 0
        afternoon = 0
        night = 0
        if x>=0 or x<6:
            emorning = 1
        if x>=6 or x<12:
            morning = 1
        if x>=12 or x<18:
            afternoon = 1
        if x>18 or x<=24:
            night = 1
        return emorning,morning,afternoon,night
    data['emorning'],data['morning'],data['afternoon'],data['night'] = zip(*data['hour'].apply(time))
    
    #删除旧特征
    data.drop(['Dates','DayOfWeek','wday'],axis=1,inplace=True)
    return data
     

In [3]:
def process_address(train,test):
    from copy import deepcopy
    # 案发地址集合
    addresses = sorted(train['Address'].unique())
    # 案发类型集合
    categories = sorted(train['Category'].unique())
    # 每类案件数量
    c_count = train.groupby(['Category']).size()
    # 每个案发地址中，每一类案件的发生数量
    a_c_count = train.groupby(['Address','Category']).size()
    # 每个案发地址的案件数量
    a_count = train.groupby(['Address']).size()
    # 测试集的案发地址集合
    new_addresses = sorted(test['Address'].unique())
    # 测试集的每个案发地点的案件数量
    new_a_count = test.groupby(['Address']).size()
    # 只在测试集中出现的新的案发地址集合
    only_new=set(new_addresses+addresses)-set(addresses)
    # 同时在测试集和训练集的案发地址集合
    in_both = set(new_addresses).intersection(addresses)
    
    # 存放每个地址，发生每一类案件的概率
    logodds={}
    # 存放每个地址，发生案件的几率
    logoddsPA={}
    
    # 最小记录数量
    min_counts = 2
    # 默认地址发生案件的几率
    default_logodds = np.log(c_count/len(train)) - np.log(1.0-c_count/float(len(train)))
    
    
    # 迭代每一个地址，对logodds和logoddsPA进行填充
    # 训练集案发地址
    for addr in addresses:
        pa = a_count[addr] / float(len(train))           # addr地址的案件数量 / 所有地址案件数量 = addr地址发生案件的概率
        logoddsPA[addr] = np.log(pa) - np.log(1-pa)
        logodds[addr] = deepcopy(default_logodds)
        for cat in a_c_count[addr].keys():
            if (a_c_count[addr][cat]>min_counts and a_c_count[addr][cat]<a_count[addr]):
                pa = a_c_count[addr][cat] / float(a_count[addr])                        # addr地址中发生cat案件 / addr地址中所有案件
                logodds[addr][categories.index(cat)] = np.log(pa) - np.log(1.0 - pa)     
        logodds[addr] = pd.Series(logodds[addr])
        logodds[addr].index = range(len(categories))
    
    # 在测试集中出现的新的案发地址
    for addr in only_new:
        PA = new_a_count[addr] / float(len(test)+len(train))
        logoddsPA[addr] = np.log(PA) - np.log(1.0-PA)
        logodds[addr] = deepcopy(default_logodds)
        logodds[addr].index = range(len(categories))
        
    # 训练集、训练集都出现的地址
    for addr in in_both:
        PA = (a_count[addr]+new_a_count[addr]) / float(len(test)+len(train))
        logoddsPA[addr] = np.log(PA) - np.log(1.0-PA)
        
    # 新增特征：案发地址发生案件的总概率
    train['logoddsPA'] = train['Address'].apply(lambda x: logoddsPA[x])
    test['loggoddsPA'] = test['Address'].apply(lambda x: logoddsPA[x])
    
    
    # 新增特征: 案发地址发生每一类案件的概率
    address_features = train['Address'].apply(lambda x: logodds[x])
    address_features.columns = ['logodds' + str(x) for x in range(len(address_features.columns))]
    train = train.join(address_features)
    address_features = test['Address'].apply(lambda x: logodds[x])
    address_features.columns = ['logodds' + str(x) for x in range(len(address_features.columns))]
    test = test.join(address_features)
    
    train.drop('Address',axis=1,inplace=True)
    test.drop('Address',axis=1,inplace=True)
    return train,test

    
    

In [4]:
def process_ss(train,test):
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(train)
    
    train = scaler.transform(train)
    test = scaler.transform(test)
    return train,test


In [5]:
if __name__ == '__main__':
    import pandas as pd
    import numpy as np
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    test.drop('Id', axis=1, inplace=True)
    train.drop(['Descript','Resolution'], axis=1, inplace=True)
    
    # 经纬度排除
    train = train[train['Y'] < 38]
    
    # 日期处理
    train = process_data(train)
    test = process_data(test)
    
    # PdDistrict处理
    dummy_pdd_train = pd.get_dummies(train['PdDistrict'], prefix='pdd')
    train = train.join(dummy_pdd_train)
    dummy_pdd_test = pd.get_dummies(test['PdDistrict'], prefix='pdd')
    test = test.join(dummy_pdd_test)
    train.drop('PdDistrict', axis=1, inplace=True)
    test.drop('PdDistrict', axis=1, inplace=True)
    
    # 犯罪地点处理
    # 判断是否发生在交叉路口
    train['interaction'] = train['Address'].apply(lambda x : 1 if '/' in x else 0)
    test['interaction'] = test['Address'].apply(lambda x : 1 if '/' in x else 0)
    
    train,test = process_address(train, test)
    
    # 目标变量处理
    target = train['Category']
    num = len(target.unique())       # 犯罪类型的数量
    name = list(target.unique())     # 犯罪类型字符串
    for i in range(0,num):
        target[target.values == name[i]] = i
    target_dict = pd.DataFrame(name)
    train.drop('Category', axis=1, inplace=True)
    
    train,test = process_ss(train,test)
    
    target.to_csv('处理后的数据target.csv', header=False, index=False)
    target_dict.to_csv('处理后的数据target_dict.csv', header=False, index=False)
    pd.DataFrame(train).to_csv('处理后的数据train.csv', header=False, index=False)
    pd.DataFrame(test).to_csv('处理后的数据test.csv', header=False, index=False)
    
    
    
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return self.partial_fit(X, y)
  
  import sys
