## 从 Kaggle 上下载 Titanic 的数据，使用 pandas 读取数据

In [148]:
# Relative path of the Titanic training CSV that the notebook loads.
CSV_FILE = "./train.csv"

In [241]:
import pandas as pd

# Read the CSV; row 0 of the file supplies the column names.
df = pd.read_csv(CSV_FILE, sep=',', header=0)

# For every column, flag whether it holds at least one missing value.
columns_missing = df.isnull().any()
print(columns_missing)

# Fill the missing values with sentinels: -1 for Age (a number),
# an empty string for Cabin and Embarked.
for column, placeholder in (('Age', -1), ('Cabin', ''), ('Embarked', '')):
    df[column] = df[column].fillna(placeholder)

# Quick look at the result: first two rows, the column labels, the row count.
print(df.head(2))
print(df.columns)
print(len(df))

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   

   Parch     Ticket     Fare Cabin Embarked  
0      0  A/5 21171   7.2500              S  
1      0   PC 17599  71.2833   C85        C  
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
891


## 拼装bulk脚本

In [242]:
#========================create bulk data==================
# Target index and mapping type, plus the column whose value is used as
# each document's _id.
INDEX_NAME = "titanic2"
TYPE_NAME = "passenger"
ID_FIELD = "PassengerId"

# Request body for the bulk call: for every row of `df`, an "index" action
# entry immediately followed by the document that action applies to.
bulk_data = []
# Not needed by the loop below; retained in case later cells reference it.
headers = df.columns

# to_dict(orient="records") produces one {column: value} dict per row.
# The previous loop read `row[i]` on a Series indexed by column labels, i.e.
# positional access through [], which recent pandas releases deprecate.
for data_dict in df.to_dict(orient="records"):
    op_dict = {
        "index": {
            "_index": INDEX_NAME,
            "_type": TYPE_NAME,
            "_id": data_dict[ID_FIELD],
        }
    }
    bulk_data.append(op_dict)
    bulk_data.append(data_dict)


## 向elasticsearch中写数据

In [243]:
#====================create elasticsearch index========#
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node the notebook writes to and reads from.
es = Elasticsearch(hosts=[{"host":"192.168.18.187", "port":9201}])

# Drop the index if it already exists so that the cell can be re-run.
# The index name is passed by keyword, like the delete/create calls below;
# newer client releases no longer accept it positionally.
if es.indices.exists(index=INDEX_NAME):
    print("deleting %s index...." % (INDEX_NAME))
    res = es.indices.delete(index=INDEX_NAME)
    print("response:%s" % res)

# Index settings: a single primary shard and no replicas.
request = {
    "settings": {
        "number_of_shards":1,
        "number_of_replicas":0
    }
}
print("creating %s index..." % INDEX_NAME)
res = es.indices.create(index=INDEX_NAME, body=request)
print("response:%s" % res)
    

deleting titanic2 index....
response:{'acknowledged': True}
creating titanic2 index...
response:{'acknowledged': True, 'shards_acknowledged': True}


In [244]:
#===========push data============
print("bulking indexing....")
# refresh=True is passed so the documents are searchable by the query below.
bulk_res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)

# The bulk API reports rejected documents inside the response body instead of
# raising, so the response has to be inspected explicitly. Previously it was
# overwritten by the search result and any failure went unnoticed.
if bulk_res["errors"]:
    failed = [
        item for item in bulk_res["items"]
        if any("error" in result for result in item.values())
    ]
    raise RuntimeError(
        "bulk indexing failed for %d document(s); first failure: %s"
        % (len(failed), failed[:1])
    )

# Fetch two documents back as a sanity check of what was stored.
res = es.search(index=INDEX_NAME, size=2, body={"query":{"match_all":{}}})
print("response:%s " % res)

bulking indexing....
response:{'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'failed': 0}, 'hits': {'total': 891, 'max_score': 1.0, 'hits': [{'_index': 'titanic2', '_type': 'passenger', '_id': '1', '_score': 1.0, '_source': {'PassengerId': 1, 'Survived': 0, 'Pclass': 3, 'Name': 'Braund, Mr. Owen Harris', 'Sex': 'male', 'Age': 22.0, 'SibSp': 1, 'Parch': 0, 'Ticket': 'A/5 21171', 'Fare': 7.25, 'Cabin': '', 'Embarked': 'S'}}, {'_index': 'titanic2', '_type': 'passenger', '_id': '2', '_score': 1.0, '_source': {'PassengerId': 2, 'Survived': 1, 'Pclass': 1, 'Name': 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'Sex': 'female', 'Age': 38.0, 'SibSp': 1, 'Parch': 0, 'Ticket': 'PC 17599', 'Fare': 71.2833, 'Cabin': 'C85', 'Embarked': 'C'}}]}} 


## 读取已存入elasticsearch的数据

In [245]:
from pandas.io.json import json_normalize

# No `size` is passed, so only the default page of 10 hits comes back even
# though the reported total is larger (pass size=1 for a quick test).
res = es.search(index=INDEX_NAME, doc_type=TYPE_NAME, body={"query":{"match_all":{}}})
print(res['hits']['total'])

# Flatten the hits into a frame; nested keys become dotted column names.
df = json_normalize(res['hits']['hits'])

# Keep only the document fields (columns starting with "_source.").
# Raw string: "\." is not a valid escape in a normal string literal.
df2 = df.filter(regex=r"^_source\.")

# Drop the "_source." prefix. Every remaining column starts with it, so slice
# it off rather than replacing the text wherever it occurs in the name.
df3 = df2.rename(columns=lambda x: x[len("_source."):])

df = df3
print(df.shape)

891
(10, 12)


In [184]:
# Fetch every document with the scroll API instead of hard-coding the number
# of rows in `size`: the first search opens a scroll context, and each
# es.scroll() call returns the next page until a page comes back empty.
SCROLL_KEEPALIVE = "2m"   # how long the scroll context is kept alive between pages
SCROLL_PAGE_SIZE = 200    # hits requested per page

res = es.search(index=INDEX_NAME, doc_type=TYPE_NAME, scroll=SCROLL_KEEPALIVE,
                size=SCROLL_PAGE_SIZE, body={"query":{"match_all":{}}})
print(res['hits']['total'])

hits = []
page = res
sid = page.get('_scroll_id')
try:
    while page['hits']['hits']:
        hits.extend(page['hits']['hits'])
        page = es.scroll(scroll_id=sid, scroll=SCROLL_KEEPALIVE)
        # The scroll id may change between pages; always use the latest one.
        sid = page.get('_scroll_id', sid)
finally:
    # Release the scroll context instead of waiting for it to time out.
    if sid is not None:
        es.clear_scroll(scroll_id=sid)

# Flatten the hits into a frame; nested keys become dotted column names.
df = json_normalize(hits)

# Keep only the document fields (columns starting with "_source.").
# Raw string: "\." is not a valid escape in a normal string literal.
df2 = df.filter(regex=r"^_source\.")

# Drop the "_source." prefix from the column names.
df3 = df2.rename(columns=lambda x: x[len("_source."):])

df = df3
print(df.shape)

891
(891, 12)


'\n# Start scrolling\nwhile(scroll_size >0):\n    print "Scrolling..."\n    page = es.scroll(scroll_id = sid, scroll =\'2m\')\n    # Update the scroll ID\n    sid = page[\'_scroll_id\']\n    # Get the number of results that we returned in the last scroll\n    scroll_size = len(page[\'hits\'][\'hits\'])\n    print "scroll size: "+ str(scroll_size)\n    # Do something with the obtained page\n'

In [222]:
from sklearn import svm
# Create a support-vector classifier with scikit-learn's default settings.
# NOTE(review): train_x / train_y are not defined in the cells shown here —
# confirm an earlier cell creates them before running top to bottom.
clf = svm.SVC()
# fit() is the cell's last expression, so the notebook echoes the estimator.
clf.fit(train_x, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [223]:
from sklearn.metrics import *

# Predict labels for valid_x with the fitted classifier and compare them with
# valid_y: print the confusion matrix first, then the accuracy score and the
# classification report, each on its own print call.
predict = clf.predict(valid_x)
cm = confusion_matrix(valid_y, predict)
print(cm)
for metric in (accuracy_score, classification_report):
    print(metric(valid_y, predict))

[[145  21]
 [ 78  24]]
0.630597014925
             precision    recall  f1-score   support

          0       0.65      0.87      0.75       166
          1       0.53      0.24      0.33       102

avg / total       0.61      0.63      0.59       268

