## 非监督学习基本理解为聚类

1. lazy learning:  Target function will be approximately locally
2. eager learning: Dataset with few attributes

In [5]:
import numpy as np

In [6]:
import random

In [6]:
numbers = [random.randint(0, 100) for i in range(100)]

## Mean 平均

In [7]:
np.mean(numbers)

51.96

## 中位数

In [10]:
np.percentile(numbers, 50)

53.0

## 众数

In [12]:
from collections import Counter

In [13]:
Counter(numbers).most_common(1)

[(53, 5)]

In [14]:
from scipy import stats

In [16]:
stats.mode(numbers)

ModeResult(mode=array([53]), count=array([5]))

## 标准差、方差

In [19]:
numbers = [random.randint(0, 1000) for i in range(100)]

In [27]:
std = np.std(numbers)

In [28]:
mean = np.mean(numbers)

## 数据的Normalization

## $x_{normalized} = \frac{x -\bar{X}}{std}$  
会使得程序容易计算

In [29]:
[(n - mean) / std for n in numbers]

[0.6580487451102447,
 -0.13553590465774343,
 0.7450646058304188,
 0.9956702847045203,
 0.2925821300855133,
 1.228872791434587,
 -0.6750342411228231,
 1.1592601028584477,
 0.8146772944065581,
 0.03153454792499091,
 -1.5939217303278619,
 0.7833515845472954,
 0.7555065091168397,
 -0.45575427210798425,
 -0.6367472624059464,
 -0.46271554096559814,
 -0.45575427210798425,
 -1.273703362877621,
 -1.8306048714867356,
 -0.6506698001211743,
 1.5456105244560208,
 0.3552335498040387,
 0.15683738736204167,
 -0.2956450883828638,
 0.6719712828254725,
 -1.2319357497319374,
 0.15683738736204167,
 -0.9151980167105036,
 -0.37918031467423097,
 0.18120182836369042,
 -0.6437085312635603,
 1.4516333948782327,
 1.1035699519975362,
 0.8703674452674696,
 -1.6322087090447386,
 0.9678252092740646,
 1.228872791434587,
 -0.6541504345499812,
 -1.8027597960562798,
 -1.479060794177232,
 -1.385083664599444,
 1.127934392999185,
 0.52578463681558,
 -1.5939217303278619,
 0.6580487451102447,
 -1.0439814905763614,
 1.14185693

## Outlier

In [30]:
np.percentile(numbers, 25) / 1.5

214.5

In [31]:
np.percentile(numbers, 75) * 1.5

1193.625

In [36]:
[n for n in numbers if n > (np.percentile(numbers, 25) / 1.5) and n < np.percentile(numbers, 75) * 1.5]

[732,
 504,
 757,
 829,
 627,
 896,
 349,
 876,
 777,
 552,
 768,
 760,
 412,
 360,
 410,
 412,
 356,
 987,
 645,
 588,
 458,
 736,
 588,
 280,
 434,
 595,
 358,
 960,
 860,
 793,
 821,
 896,
 355,
 867,
 694,
 732,
 243,
 871,
 655,
 929,
 953,
 481,
 610,
 961,
 271,
 552,
 538,
 748,
 245,
 582,
 592,
 920,
 414,
 739,
 563,
 770,
 376,
 575,
 984,
 294,
 592,
 227,
 833,
 274,
 810,
 728,
 508,
 278,
 682,
 331,
 873,
 424,
 957,
 806,
 804,
 339,
 886,
 869,
 668,
 358,
 979,
 969,
 444]

# 掐头去尾会不会降低模型的泛化能力
基于现有的技术水平只能去掉，目前来说

---
1. model:就是对数据一种抽象，往往抽象成函数  
2. classification and regression:  
3. categorical and numerical:
4. outlier:
---
# 半监督学习就是数据量很少的时候用

---
分类不写成0123，用softmax处理  


---

# Mean Squared Error: 
## $ Loss_n = \frac{1}{n}(\sum{y_i-f(x_i)}^2)$

---
# Logistic Regression:  
## $ p = \frac{1}{1+e^{-(w*x+b)}}$

https://github.com/SSaishruthi

1. represent the words content?
   + tfidf
   + Word Embedding
2. Propercessing, scaling, normalization
3. Baseline,evaluation (必须比87%高)
4. Try different models, monitor the performance
5. Based on the performance, tuning the parameters
---
Unbalance:  
1. use sub dataset 
2. P,R,F1,F2

In [2]:
file = '/Users/zzy/Projects/select_from_sqlResult_1558435.csv'

In [3]:
import pandas as pd

In [4]:
content = pd.read_csv(file,  delimiter='\t', encoding='utf-8', error_bad_lines=False, usecols=[0, 2, 3])

In [7]:
content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89609 entries, 0 to 89608
Data columns (total 3 columns):
id         89609 non-null int64
source     89607 non-null object
content    87052 non-null object
dtypes: int64(1), object(2)
memory usage: 2.1+ MB


In [8]:
content.describe()

Unnamed: 0,id
count,89609.0
mean,44808.102758
std,25869.603643
min,1.0
25%,22405.0
50%,44808.0
75%,67210.0
max,89617.0


In [9]:
news_csv = content.dropna(axis=0)

In [10]:
news_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87050 entries, 0 to 89608
Data columns (total 3 columns):
id         87050 non-null int64
source     87050 non-null object
content    87050 non-null object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


In [11]:
news_csv.loc[news_csv['source'].str.contains('新华')]

Unnamed: 0,id,source,content
3,89614,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\n
17,89600,新华网,国歌法草案首次提请最高立法机关审议\n点击图片进入下一页\n新闻：十二届全国人大常委会第二十...
25,89592,新华网,当地时间2017年6月17日，土耳其马尼萨省，在土耳其第1步兵训练大队司令部军营，590名土...
27,89590,新华网,英国与欧盟的“脱欧”谈判于19日正式开始。然而此时，英国首相特雷莎·梅正面临着空前的政治压力...
39,89578,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\n
51,89566,新华网,戈壁的大漠黄沙曾掩埋了无数西域古道，而如今一条大漠天路正顽强地与黄沙“搏斗”，在乌兰布和、腾...
65,89552,新华社@,从百度、支付宝到App?“高考志愿填报”五花八门\n顾女士近来减少了刷朋友圈的频率，一有时间...
95,89522,新华社,新华社韩国济州6月18日电综述：亚投行第二届年会三大亮点\n新华社记者 耿学鹏 严蕾\n亚洲...
104,89513,新华网,粤港澳概念板块午后走强，珠海港、恒基达鑫、白云机场等个股的午后纷纷快速拉升。\n据悉，首届粤...
113,89504,新华社,新华社北京6月18日电 经军委领导批准，《军营理论热点怎么看·2017》日前印发全军。\n为...


In [12]:
news_csv['is_xinhua'] = np.where(news_csv['source'].str.contains('新华'), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
news_csv.head()

Unnamed: 0,id,source,content,is_xinhua
0,89617,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,0
1,89616,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,0
2,89615,快科技@http://www.kkj.cn/,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\n至于...,0
3,89614,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\n,1
4,89613,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\n@深圳交警微博称：昨日清晨交...,0


In [14]:
news_csv.loc[news_csv['is_xinhua']==0]

Unnamed: 0,id,source,content,is_xinhua
0,89617,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,0
1,89616,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,0
2,89615,快科技@http://www.kkj.cn/,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\n至于...,0
4,89613,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\n@深圳交警微博称：昨日清晨交...,0
5,89612,中国证券报?中证网,受到A股被纳入MSCI指数的利好消息刺激，A股市场从周三开始再度上演龙马行情，周四上午金...,0
6,89611,威锋网@http://www.feng.com/,虽然至今夏普智能手机在市场上无法排得上号，已经完全没落，并于 2013 年退出中国市场，但是...,0
7,89610,中国证券报?中证网,沙漠雄鹰：震荡有利消化套牢筹码\n 周四开盘上证50在银行券商大蓝筹带动下一度涨近2%...,0
8,89609,荆楚网-楚天都市报,（原标题：武汉警方一下子抓了808人，还都是俊男靓女！原来他们每天偷偷摸摸干这事！）\n武汉...,0
9,89608,中国证券报?中证网,6月21日，A股纳入MSCI指数尘埃落定，但当天被寄予厚望的券商股并未扛起反弹大旗。22...,0
10,89607,证券时报网,证券时报网（www.stcn.com）06月23日讯\n 据上证报道，6月初以来，创业...,0


## 提取6000新华社和6000非新华社

In [15]:
X = []
y = []
i = 0
j = 0

In [None]:
content_list= news_csv['content'].tolist()
for k, boolean in enumerate(news_csv['is_xinhua']):
    if boolean == 1 and i < 100:
        X.append(content_list[k])
        Y.append(1)
        i += 1
    elif boolean == 0 and j < 100:
        X.append(content_list[k])
        Y.append(0)

In [29]:
a = np.arange(10)

In [31]:
np.random.shuffle(a)
a

array([5, 3, 2, 1, 4, 9, 7, 0, 8, 6])