In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from scipy import  stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot

In [290]:
# Key columns of the count tables (location id, time-stamp); passed as the
# `by=` argument wherever this notebook sorts those tables.
selected_features = ['loc-id', 'time-stamp']

In [288]:
def loc_time_counts(train_data, locs = range(1, 37), out_pattern = './December/Dec/loc_time_counts%d.csv'):
    """Write one header-less CSV per location counting the rows seen at each time-stamp.

    For every location id in ``locs``, the rows of ``train_data`` whose
    ``loc-id`` equals that id are selected and the occurrences of each distinct
    ``time-stamp`` value are counted with ``value_counts`` (so rows come out
    ordered by descending count, not by time).

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain the columns ``loc-id`` and ``time-stamp``.
    locs : iterable of int, optional
        Location ids to process. Defaults to 1..36, the range that was
        previously hard-coded.
    out_pattern : str, optional
        ``%``-style pattern with a single ``%d`` placeholder that receives the
        location id and yields the output path. Defaults to the path that was
        previously hard-coded. The target directory must already exist.

    Returns
    -------
    None
        The function is used for its side effect of writing the CSV files,
        each with columns ``loc-id, time-stamp, numOfPeople`` and no header.
    """
    out_columns = ['loc-id', 'time-stamp', 'numOfPeople']
    for loc in locs:
        counts = train_data.loc[train_data['loc-id'] == loc, 'time-stamp'].value_counts()
        per_loc = pd.DataFrame({
            # Repeat the id once per distinct time-stamp so all columns have equal length.
            'loc-id': [loc] * len(counts),
            'time-stamp': counts.index,
            'numOfPeople': counts.values,
        })
        per_loc.to_csv(out_pattern % loc, columns = out_columns, header = False, index = False)

In [289]:
# Load the raw records and write the per-location time-stamp counts to disk.
# NOTE(review): the frame named `train_data` is read from 'test1.csv' — confirm
# that this is the intended input file.
train_data = pd.read_csv('test1.csv')
loc_time_counts(train_data)

In [294]:
# Ensure every (location, day, hour) slot has a row: slots that are absent from
# the source file are added with a NaN count so later steps see a complete grid.
Dec_loc_time_num = pd.read_csv('./December/Dec_loc_time_num.csv')

# (loc-id, time-stamp) pairs already present. A set gives O(1) membership tests
# instead of re-filtering the whole frame for each of the 36 * 30 * 24 slots.
seen_slots = set(zip(Dec_loc_time_num['loc-id'], Dec_loc_time_num['time-stamp']))

missing_rows = []
for loc in range(1, 37):
    for day in range(1, 31):
        for hour in range(0, 24):
            # Time-stamps are the integer MMDDHH with zero-padded day and hour;
            # the month is fixed to 11 here.
            time = int('11%02d%02d' % (day, hour))
            if (loc, time) not in seen_slots:
                missing_rows.append({'loc-id': loc, 'time-stamp': time, 'numOfPeople': np.nan})

if missing_rows:
    # One concat at the end replaces the per-row DataFrame.append calls
    # (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
    Dec_loc_time_num = pd.concat([Dec_loc_time_num, pd.DataFrame(missing_rows)], ignore_index = True)



In [295]:
# Cast the key columns to int (presumably they can become float once NaN filler
# rows have been added — TODO confirm), order the table by (loc-id, time-stamp)
# and save it with a header row.
Dec_loc_time_num['loc-id'] = Dec_loc_time_num['loc-id'].astype(int)
Dec_loc_time_num['time-stamp'] = Dec_loc_time_num['time-stamp'].astype(int)
Dec_loc_time_num = Dec_loc_time_num.sort_values(by = selected_features)
Dec_loc_time_num.to_csv('./December/Dec_loc_time_num_fillna.csv', columns = ['loc-id', 'time-stamp', 'numOfPeople'], index = False)

In [297]:
# Stack the previously prepared history on top of the filled table, order the
# result by (loc-id, time-stamp) and save it with a header row.
loc_time_num = pd.read_csv('loc_time_num_fillna_diff.csv')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Like append, it keeps the original row labels of both frames.
loc_time_num = pd.concat([loc_time_num, Dec_loc_time_num])
loc_time_num = loc_time_num.sort_values(by = selected_features)
loc_time_num.to_csv('./December/all_loc_time_num_fillna_diff.csv', columns = ['loc-id', 'time-stamp', 'numOfPeople'], index = False)

In [580]:
def generateData(all_train_data, locs = range(1, 37), endDate = '12/1/2015'):
    """Build the weekday-based training table.

    The hourly counts of each location are laid over the hourly time axis that
    starts at 7/1/2015 00:00 and ends one hour before ``endDate``. They are
    then grouped by (month, weekday, hour): each output row carries the list of
    counts observed in that slot, in chronological order, with missing values
    kept as NaN. Rows are always produced for months 7 to 11; a slot with no
    hours inside the time axis gets an empty list.

    Parameters
    ----------
    all_train_data : pandas.DataFrame
        Must contain ``loc-id`` and ``numOfPeople``. For each location the rows
        are taken positionally, so they are assumed to be ordered by hour
        starting at 7/1/2015 00:00 — TODO confirm against the input files.
    locs : iterable of int, optional
        Locations to include; defaults to all of 1..36.
    endDate : str, optional
        The day AFTER the last training day; defaults to '12/1/2015'.

    Returns
    -------
    pandas.DataFrame
        Columns ``loc-id, month, weekday, hour, numOfPeople`` (the last one
        holding Python lists), sorted by the first four columns.

    Raises
    ------
    ValueError
        If a requested location has fewer rows than the time axis has hours.
    """
    hours = pd.date_range('7/1/2015', endDate, freq='h')[:-1]
    key_columns = ['loc-id', 'month', 'weekday', 'hour']
    records = []
    for loc in locs:
        counts = all_train_data[all_train_data['loc-id'] == loc]['numOfPeople'][:len(hours)]
        if len(counts) != len(hours):
            raise ValueError('loc-id %s has %d hourly rows, expected at least %d'
                             % (loc, len(counts), len(hours)))

        # Single pass over the time axis: bucket every hourly value by its slot
        # instead of rescanning the whole axis once per slot.
        buckets = {}
        for stamp, value in zip(hours, counts.values):
            slot = (stamp.month, stamp.weekday(), stamp.hour)
            buckets.setdefault(slot, []).append(np.nan if pd.isnull(value) else value)

        for month in range(7, 12):
            for weekday in range(7):
                for hour in range(24):
                    records.append({'loc-id': loc, 'month': month, 'weekday': weekday, 'hour': hour,
                                    'numOfPeople': buckets.get((month, weekday, hour), [])})

    # Build the frame once; growing it row by row is quadratic and relied on
    # DataFrame.append, which was removed in pandas 2.0.
    week_data = pd.DataFrame(records, columns = key_columns + ['numOfPeople'])
    for column in key_columns:
        week_data[column] = week_data[column].astype(int)
    # sort_values returns a new frame, so the result has to be kept.
    return week_data.sort_values(by = key_columns).reset_index(drop = True)

In [581]:
# Build the weekday/hour training table from the combined history; with
# endDate '12/1/2015' the time axis runs through the last hour of 11/30/2015.
all_train_data = pd.read_csv('./December/all_loc_time_num_fillna_all_diff_four.csv')
data = generateData(all_train_data, endDate = '12/1/2015')
# Alternative input kept for reference: the history that stops one month earlier.
# all_train_data = pd.read_csv('loc_time_num_fillna_diff.csv')
# data = generateData(all_train_data, endDate = '11/1/2015')
data.head()

Unnamed: 0,hour,loc-id,month,numOfPeople,weekday
0,0,1,7,"[7.0, 4.0, 5.0, nan]",0
1,1,1,7,"[2.0, 1.0, 3.0, nan]",0
2,2,1,7,"[3.0, 1.0, 1.0, nan]",0
3,3,1,7,"[2.0, 1.0, 1.0, nan]",0
4,4,1,7,"[3.0, 3.0, 1.0, nan]",0


In [507]:
# Time-series forecast
def doArma(trainData, p, q):
    """Fit an ARMA(p, q) model on ``trainData`` and return its forecast for the next step.

    The values are placed on an hourly axis starting at 11/1/2015 00:00 and the
    model is asked for the single hour that follows the last training value.

    Parameters
    ----------
    trainData : sequence of numbers
        Training values in chronological order.
    p, q : int
        AR and MA orders handed to ``sm.tsa.ARMA``.

    Returns
    -------
    The first (only) value of the prediction returned by the fitted model.

    NOTE(review): ``sm.tsa.ARMA`` is no longer available in statsmodels >= 0.13;
    confirm the pinned statsmodels version before re-running this notebook.
    """
    # One extra period: the last stamp is the hour to be forecast.
    hours = pd.date_range('11/1/2015', periods = len(trainData) + 1, freq='h')

    series = pd.Series(trainData)
    series.index = pd.Index(hours[:-1])
    fitted = sm.tsa.ARMA(series.astype(float), (p, q)).fit()

    next_hour = str(hours[-1:].values[0])
    forecast = fitted.predict(next_hour, next_hour, dynamic=True)
    return forecast.values[0]

In [586]:
def _splitHistory(slot_rows):
    """Split the month-10 and month-11 history of one (weekday, hour) slot into two buckets.

    ``slot_rows`` holds the training rows of a single location/weekday/hour and
    must contain one row each for months 10 and 11. Returns ``(special, regular)``:

    * ``special`` gets the first October value (kept apart as ``holiday`` in an
      earlier version of this code — presumably the National Day week, TODO
      confirm) and, when a month has at least three values, every value above
      35% or below 5% of that month's sum.
    * ``regular`` gets all remaining non-null values.

    NOTE(review): the month sum is taken with ``np.sum``, so a single NaN in the
    list makes the sum NaN and both threshold comparisons False, which silently
    disables the outlier rule for that month. Kept as is to preserve results.
    """
    special, regular = [], []
    for month in range(10, 12):
        nums = slot_rows[slot_rows['month'] == month]['numOfPeople'].values[0]
        has_enough = len(nums) >= 3
        total = np.sum(nums)
        for position, value in enumerate(nums):
            if pd.isnull(value):
                continue
            if month == 10 and position == 0:
                special.append(value)
            elif has_enough and (value > total * 0.35 or value < total * 0.05):
                special.append(value)
            else:
                regular.append(value)
    return special, regular


def meanPredict(trainData, startDate, endDate, locs = range(1, 37), delta = 0.8, doArmaOrNot = False):
    """Weighted-mean forecast of the hourly head count per location.

    For every hour in [startDate, endDate) and every location, the history of
    the matching (weekday, hour) slot is split into a "special" and a "regular"
    bucket (see ``_splitHistory``). The forecast is::

        mean(special) * (1 - delta) + mean(regular) * delta

    where an empty bucket contributes 0. Once 168 forecasts (one week) exist for
    a location, the forecast made exactly one week earlier is added to the
    regular bucket before averaging.

    Parameters
    ----------
    trainData : pandas.DataFrame
        Table produced by ``generateData``.
    startDate : str
        First day to forecast.
    endDate : str
        The day AFTER the last forecast day.
    locs : iterable of int, optional
        Locations to forecast; defaults to all of 1..36.
    delta : float, optional
        Weight of the regular bucket; the special bucket gets ``1 - delta``.
    doArmaOrNot : bool, optional
        When True, a one-step ``doArma`` forecast fitted on the forecasts made
        so far for this location is also added to the regular bucket.

    Returns
    -------
    pandas.DataFrame
        Integer columns ``loc-id, time-stamp, numOfPeople``; ``time-stamp`` is
        the integer MMDDHH (month not zero-padded) and ``numOfPeople`` is
        truncated to int. Rows are in (location, chronological) order.
    """
    hours = pd.date_range(startDate, endDate, freq='h')[:-1]
    out_columns = ['loc-id', 'time-stamp', 'numOfPeople']
    records = []
    for loc in locs:
        loc_rows = trainData[trainData['loc-id'] == loc]
        # The history split depends only on (weekday, hour), so compute it once
        # per slot rather than once per forecast hour.
        slot_cache = {}
        last_week = []      # sliding window holding the previous 168 forecasts
        arma_history = []   # forecasts made so far, used as ARMA training data

        for stamp in hours:
            slot = (stamp.weekday(), stamp.hour)
            if slot not in slot_cache:
                slot_rows = loc_rows[(loc_rows['weekday'] == slot[0]) & (loc_rows['hour'] == slot[1])]
                slot_cache[slot] = _splitHistory(slot_rows)
            # Copy: the buckets are extended below and the cache must stay pristine.
            special = list(slot_cache[slot][0])
            regular = list(slot_cache[slot][1])

            if len(last_week) == 168:
                regular.append(last_week.pop(0))

            if doArmaOrNot:
                try:
                    regular.append(doArma(arma_history, 1, 0))
                except Exception:
                    # Best effort: the fit fails on short or degenerate
                    # histories, in which case the hour is forecast without
                    # the ARMA term. Unlike a bare `except:`, this does not
                    # swallow KeyboardInterrupt / SystemExit.
                    pass

            # Weighted mean of the two buckets; max(..., 1) guards the empty case.
            numOfPeople = np.sum(special) / max(len(special), 1) * (1 - delta) \
                + np.sum(regular) / max(len(regular), 1) * delta

            if doArmaOrNot:
                arma_history.append(numOfPeople)
            last_week.append(numOfPeople)

            records.append({'loc-id': loc,
                            'time-stamp': stamp.month * 10000 + stamp.day * 100 + stamp.hour,
                            'numOfPeople': numOfPeople})

    # Build the frame once; growing it row by row is quadratic and relied on
    # DataFrame.append, which was removed in pandas 2.0.
    prediction = pd.DataFrame(records, columns = out_columns)
    for column in out_columns:
        prediction[column] = prediction[column].astype(int)
    # The former sort_values call discarded its result and was a no-op; rows are
    # deliberately returned in generation order, as before.
    return prediction

In [595]:
# Forecast every hour of December 2015 for all locations, weighting the regular
# bucket with delta = 0.9.
mean_prediction = meanPredict(data, '12/1/2015', '1/1/2016', delta = 0.9)
# Earlier variants kept for reference (default delta; November run with delta = 0.76).
# # mean_prediction = meanPredict(data, '12/1/2015', '1/1/2016')
# mean_prediction = meanPredict(data, '11/1/2015', '12/1/2015', delta = 0.76)
mean_prediction.head()



Unnamed: 0,loc-id,numOfPeople,time-stamp
0,1,9,120100
1,1,4,120101
2,1,3,120102
3,1,3,120103
4,1,2,120104


In [596]:
# Save the forecast as a header-less CSV with columns loc-id, time-stamp, numOfPeople.
mean_prediction.to_csv('./December/result/result.csv', columns = ['loc-id', 'time-stamp', 'numOfPeople'], header = False, index = False)
# Alternative output location kept for reference.
# mean_prediction.to_csv('./result/result.csv', columns = ['loc-id', 'time-stamp', 'numOfPeople'], header = False, index = False)

In [366]:
from math import sqrt


def _read_score_file(path):
    """Parse a header-less ``loc-id,time-stamp,value`` CSV into ``{(loc, stamp): value}``.

    Blank lines are skipped. Surrounding whitespace, including the line
    terminator, is stripped instead of blindly dropping the last character,
    so a final line without a trailing newline keeps its last digit.
    """
    scores = {}
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            fields = line.split(',')
            scores[(int(fields[0]), int(fields[1]))] = float(fields[2])
    return scores


def count_score(sol_file, ans_file):
    """Root-mean-square error of a submission against the reference answers.

    Parameters
    ----------
    sol_file : str
        Path of the submitted answers.
    ans_file : str
        Path of the correct answers.

    Both files are header-less CSVs with the fields loc-id, time-stamp, value.

    Returns
    -------
    ``sqrt(mean((ans - sol) ** 2))`` over every (loc-id, time-stamp) in the
    answer file, or 0 when the answer file has no rows.

    Raises
    ------
    KeyError
        If the submission lacks an entry that the answer file contains.
    """
    sol_map = _read_score_file(sol_file)
    ans_map = _read_score_file(ans_file)

    if len(ans_map) == 0:
        return 0

    square_count = 0.0
    for key, ans in ans_map.items():
        square_count += (ans - sol_map[key]) ** 2
    return sqrt(square_count / len(ans_map))


In [598]:
# RMSE between two saved result files, with the second one treated as the reference.
count_score('./December/result/result_63.3.csv', './December/result/result_61.6.csv')
# Alternative comparison kept for reference.
# count_score('./result/result.csv', './December/Dec_loc_time_num.csv')

9.027009203567388

In [355]:
# Split the count table into one header-less CSV per location
# (./December/locs/<loc>.csv); the delta search reads these files back as its
# reference answers.
Dec_train_data = pd.read_csv('./December/Dec_loc_time_num.csv')
for loc in range(1, 37):
    loc_data = Dec_train_data[Dec_train_data['loc-id'] == loc]
    loc_data.to_csv('./December/locs/%d.csv' %loc, columns = ['loc-id', 'time-stamp', 'numOfPeople'], header = False, index = False)

In [450]:
# Per-location grid search for delta: train on the history ending 10/31/2015,
# forecast November, and keep the delta with the lowest count_score against
# ./December/locs/<loc>.csv.
Dec_train_data = pd.read_csv('loc_time_num_fillna_diff.csv')
for loc in range(1, 37):
    print(loc, end = ' ')
    loc_train_data = generateData(Dec_train_data, locs = range(loc, loc + 1), endDate = '11/1/2015')
    # m: best (lowest) score seen so far; d: the delta that produced it.
    m = 1000000
    d = 0;
    # Candidate deltas 0.60, 0.61, ..., 1.19.
    for weight in range(60, 120):
        dlt = weight / 100
        predict_loc_data = meanPredict(loc_train_data, '11/1/2015', '12/1/2015', locs = range(loc, loc + 1), delta = dlt)
        # count_score works on files, so each candidate forecast is written out
        # (overwriting the previous candidate) before it is scored.
        predict_loc_data.to_csv('./December/delta/%d.csv' %loc, columns = ['loc-id', 'time-stamp', 'numOfPeople'], header = False, index = False)
        score = count_score('./December/delta/%d.csv' %loc, './December/locs/%d.csv' %loc)
        if score < m:
            m = score
            d = dlt
    print(d, m)

1 



0.65 65.92370957304335
2 1.02 35.59231587913728
3 1.19 95.124681528882
4 0.78 50.866039327183046
5 0.94 34.610099578096495
6 1.16 70.65619161682226
7 0.78 87.1970975030112
8 1.05 144.09838937797204
9 1.11 49.71448654480697
10 0.73 35.16069993819641
11 0.6 0
12 1.14 79.99861509592643
13 0.69 31.233859140915207
14 0.71 36.944376520950236
15 1.0 37.15288266941722
16 1.19 75.6714595086725
17 1.19 55.41043716451432
18 0.6 31.341253408281375
19 0.76 108.91194461046096
20 0.71 57.838276141758925
21 1.0 51.53509665315822
22 1.05 40.44449078097503
23 1.17 31.96445867999388
24 0.6 42.17847169088046
25 0.78 21.221195311169033
26 0.71 58.7203239352247
27 1.17 80.8172427402589
28 0.97 16.951871986757737
29 0.83 131.89759381706216
30 0.79 33.849831017419184
31 0.72 143.47302943208038
32 1.15 82.22369264502724
33 0.6 44.97557140405653
34 0.87 91.23655710578949
35 1.11 66.43365740146233
36 0.82 41.532116087031575


In [183]:
# For each location, forecast 10/15-10/31 with and without the ARMA term and
# print both scores against ./locs/<loc>.csv.
# NOTE(review): generateData is called here without its first positional
# argument (all_train_data), which the definition in this notebook requires.
# The cell appears to predate that signature and would raise TypeError on a
# fresh run — confirm which frame should be passed.
# NOTE(review): the paths here (./doarma, ./notdoarma, ./locs) are not under
# ./December like those in the other cells — confirm they still exist.
for loc in range(1, 37):
    Dec_arma_train_data = generateData(locs = range(loc, loc + 1), endDate = '10/15/2015')
    do_arma = meanPredict(Dec_arma_train_data, '10/15/2015', '11/1/2015', locs = range(loc, loc + 1), delta = 0.8, doArmaOrNot = True)
    do_arma.to_csv('./doarma/%d.csv' %loc, columns = ['loc-id', 'time-stamp', 'numOfPeople'], header = False, index = False)
    not_do_arma = meanPredict(Dec_arma_train_data, '10/15/2015', '11/1/2015', locs = range(loc, loc + 1), delta = 0.8, doArmaOrNot = False)
    not_do_arma.to_csv('./notdoarma/%d.csv' %loc, columns = ['loc-id', 'time-stamp', 'numOfPeople'], header = False, index = False)
    do_arma_score = count_score('./doarma/%d.csv' %loc, './locs/%d.csv' %loc)
    not_do_arma_score = count_score('./notdoarma/%d.csv' %loc, './locs/%d.csv' %loc)
    print(loc, do_arma_score, not_do_arma_score)

  return TimeSeries(result, index=self.predict_dates)


1 202.6923351191391 237.04659124263543
2 62.553917483980115 110.35375216676572




3 87.8890855455027 175.6020332287587




4 121.30984713676659 182.07270966550223
5 68.78594458782626 97.81795891562936
6 134.8840772134142 207.07110698925067
7 208.0985818142123 309.88719970855504




8 199.94001569606624 288.0875852759425
9 60.347454552250674 105.45832412891274
10 86.38728472965776 123.29455846933452
11 66.13674444298904 97.44710324598923




12 90.07315271214932 164.30498591053689
13 74.66696428512114 123.38614484917154
14 93.49488663643854 138.86820290396855
15 71.22121684157791 106.62384557145012




16 96.15675216881195 162.48576005983483




17 31.895972887180495 53.92333377978336
18 65.55333304786424 92.51240315137265
19 311.7520113534387 396.0681251994346




20 165.65858091900571 209.60208037795195
21 148.65704170802914 196.17636308204607
22 102.32339196493574 145.24532673470634
23 51.16921817742468 83.90177973067858
24 53.8883585023039 81.55615343102558
25 39.5558676641869 60.830832013517536
26 138.98730544326762 191.016275827098




27 80.90487329059658 127.29997349265838
28 32.24648919764399 43.082174625845006
29 394.082820148002 430.230796516945




30 77.94961990441803 88.31004962553733




31 493.98595721530984 656.8505908998119
32 108.52079776152601 170.76566924620255
33 167.8228379884426 189.41228148205602
34 278.7610508245191 366.8522022171939




35 112.64890282928626 135.63589604635013




36 84.70962001557669 110.57373183993738
