In [1]:
# Toy corpus: every string is one "sentence" made of space-separated words.
words_list = [
    "bad bad bad nice",       # bad x3, nice x1
    "bad bad",                # bad x2
    "bad bad bad",            # bad x3
    "bad bad bad bad",        # bad x4
    "bad bad bad good good",  # bad x3, good x2
    "bad bad bad nice nice",  # bad x3, nice x2
]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counter created with default settings; it is fitted on words_list in the next cell.
counter = CountVectorizer()

In [3]:
# Learn the vocabulary from words_list and build the word-count matrix in one step.
# The result is a sparse matrix with one row per sentence and one column per distinct word.
X = counter.fit_transform(words_list)
X

<6x3 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [4]:
# Convert the sparse matrix to a dense NumPy array so the raw counts can be read directly.
X.toarray()

array([[3, 0, 1],
       [2, 0, 0],
       [3, 0, 0],
       [4, 0, 0],
       [3, 2, 0],
       [3, 0, 2]], dtype=int64)

## bad가 X.toarray()의 0번째 열, good이 1번째 열, nice가 2번째 열을 나타낸다 (열 순서는 아래 셀에서 확인하는 단어 목록의 순서와 같다)

In [5]:
# Vocabulary learned by the vectorizer, listed in the column order of X.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its
# replacement. It returns a NumPy array, so list() keeps the plain-list display.
list(counter.get_feature_names_out())

['bad', 'good', 'nice']

## IDF 계산
### n 은 전체 문장 수
### df(X)  는 X라는 단어가 나온 문장의 수

- df(bad) = 6
- df(nice) = 2
- df(good) = 1

### IDF를 계산하면
$$
 \mathrm{IDF}(X) = \ln{\frac{1+n}{1+\mathrm{df}(X)}} + 1
$$

### TF-IDF 계산
$$
 \text{tf-idf}(X) = \mathrm{tf}(X) \times \mathrm{IDF}(X)
$$

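### TF-IDF 벡터를 L2 정규화하면
- 각 문장의 TF-IDF 벡터 $v$를 자신의 L2 노름으로 나눈다 (아래 그림의 y와 같은 형태): $v_{norm} = \frac{v}{\lVert v \rVert_2} = \frac{v}{\sqrt{v_1^2 + v_2^2 + \cdots + v_n^2}}$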
![](https://freesouls.github.io/imgs/1.png)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)


In [8]:
# TF-IDF vectorizer created with default settings; it is fitted on words_list in the next cell.
vectorizer = TfidfVectorizer()

In [10]:
# Fit the vectorizer on the toy corpus and show the TF-IDF weights as a dense array
# (one row per sentence, one column per vocabulary word).
tfidf_vector = vectorizer.fit_transform(words_list)
tfidf_vector.toarray()

array([[0.85151335, 0.        , 0.52433293],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.55422893, 0.83236428, 0.        ],
       [0.63035731, 0.        , 0.77630514]])

In [11]:
# Vocabulary of the TF-IDF vectorizer, listed in the column order of tfidf_vector.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its
# replacement. It returns a NumPy array, so list() keeps the plain-list display.
list(vectorizer.get_feature_names_out())

['bad', 'good', 'nice']

In [13]:
# Label the TF-IDF columns with their words so the matrix is readable as a table.
# get_feature_names() was removed in scikit-learn 1.2, so the column labels come from
# its replacement, get_feature_names_out().
df01 = pd.DataFrame(tfidf_vector.toarray(), columns=vectorizer.get_feature_names_out())
df01

Unnamed: 0,bad,good,nice
0,0.851513,0.0,0.524333
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.554229,0.832364,0.0
5,0.630357,0.0,0.776305


In [14]:
# Unseen sentences used to try out the already-fitted vectorizer.
# They mix known words (bad / good / nice) with words that were not in the training corpus.
new_text = [
    "today is bad bad bad nice",
    "columns are bad bad bad good",
    "god is nice nice nice good",
]

In [15]:
# Vectorize the new sentences with the already-fitted vectorizer.
# Only transform() is called here, so the vectorizer is not fitted again.
new_X = vectorizer.transform(new_text)

In [16]:
# Dense view of the TF-IDF weights computed for the new sentences.
new_X.toarray()

array([[0.85151335, 0.        , 0.52433293],
       [0.79964636, 0.60047123, 0.        ],
       [0.        , 0.37657342, 0.92638678]])

In [17]:
# Same table view as before, this time for the new sentences.
# get_feature_names() was removed in scikit-learn 1.2, so the column labels come from
# its replacement, get_feature_names_out().
df02 = pd.DataFrame(new_X.toarray(), columns=vectorizer.get_feature_names_out())
df02

Unnamed: 0,bad,good,nice
0,0.851513,0.0,0.524333
1,0.799646,0.600471,0.0
2,0.0,0.376573,0.926387


## 영어 뉴스 데이터 가져오기

In [18]:
import base64
import http.client
import json
import os
import urllib.error
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd

In [21]:
# Empty table with the three fields that are kept for every news article.
df = pd.DataFrame(columns=['name', 'description', 'category'])
df

Unnamed: 0,name,description,category


In [31]:
# Request headers for the news API calls below. The subscription key is read from the
# BING_NEWS_API_KEY environment variable instead of being pasted into the notebook,
# so the key is never saved or shared together with the file.
headers = {
    'Ocp-Apim-Subscription-Key': os.environ.get('BING_NEWS_API_KEY', '')
}
if not headers['Ocp-Apim-Subscription-Key']:
    # Warn early: without a key the requests in the following cells cannot be authenticated.
    print("BING_NEWS_API_KEY is not set; set it before running the API request cells.")

In [32]:
# Query string for the news request.
# Available categories: World, Business, Entertainment, Health, Politics,
# ScienceAndTechnology, Sports, UK.
params = urllib.parse.urlencode([
    ('Category', 'World'),
    ('Market', 'en-GB'),
    ('Count', 100),
])

In [33]:
# Show the URL-encoded query string that will be appended to the request path.
params

'Category=World&Market=en-GB&Count=100'

In [42]:
# Send one GET request using the query string in `params` and print the raw response bytes.
conn = http.client.HTTPSConnection('api.cognitive.microsoft.com')
try:
    # NOTE(review): '{body}' looks like an unfilled template placeholder that is sent as the
    # request body; a GET normally needs no body. Kept as-is to leave the request unchanged.
    conn.request('GET', '/bing/v7.0/news/?%s' % params, '{body}', headers)
    response = conn.getresponse()
    data = response.read()
    print("data : ", data)
except Exception as e:
    # Only OSError-style exceptions carry errno/strerror. Reading them unconditionally
    # raised AttributeError inside this handler and hid the real error, so fall back
    # to the exception text when they are missing.
    print(f"Error {getattr(e, 'errno', None)},   {getattr(e, 'strerror', None) or e}")
finally:
    # Close the connection on success and on failure alike.
    conn.close()


data :  b'{"_type": "News", "webSearchUrl": "https:\\/\\/www.bing.com\\/news\\/search?q=&nvaug=%5bNewsVertical+CategoryBrowseRequest%3d%221%22+Category%3d%22rt_World%22+EnableCategoryPagination%3d%221%22+ResultsPerPage%3d%22100%22%5d&form=TNSA02", "value": [{"name": "Coronavirus: A Cameroon student on how he recovered", "url": "https:\\/\\/www.bbc.co.uk\\/news\\/world-africa-51502711", "image": {"thumbnail": {"contentUrl": "https:\\/\\/www.bing.com\\/th?id=ON.948A409B725B8BBE5C4593EA563FF872&pid=News", "width": 700, "height": 393}}, "description": "When Kem Senou Pavel Daryl, a 21-year-old Cameroonian student living in the Chinese city of Jingzhou, contracted the coronavirus he had no intention of leaving China, even if that were possible. \\"No matter what happens I don\'t want to take the sickness back to Africa,\\" he said from his university dormitory, where he is now under a 14-day ...", "provider": [{"_type": "Organization", "name": "BBC", "image": {"thumbnail": {"contentUrl": "h

In [43]:
# Decode the raw response bytes into text.
# The isinstance guard keeps this cell re-runnable: once `data` is already a str,
# calling .decode() on it again would raise AttributeError.
if isinstance(data, bytes):
    data = data.decode('utf-8')
data

'{"_type": "News", "webSearchUrl": "https:\\/\\/www.bing.com\\/news\\/search?q=&nvaug=%5bNewsVertical+CategoryBrowseRequest%3d%221%22+Category%3d%22rt_World%22+EnableCategoryPagination%3d%221%22+ResultsPerPage%3d%22100%22%5d&form=TNSA02", "value": [{"name": "Coronavirus: A Cameroon student on how he recovered", "url": "https:\\/\\/www.bbc.co.uk\\/news\\/world-africa-51502711", "image": {"thumbnail": {"contentUrl": "https:\\/\\/www.bing.com\\/th?id=ON.948A409B725B8BBE5C4593EA563FF872&pid=News", "width": 700, "height": 393}}, "description": "When Kem Senou Pavel Daryl, a 21-year-old Cameroonian student living in the Chinese city of Jingzhou, contracted the coronavirus he had no intention of leaving China, even if that were possible. \\"No matter what happens I don\'t want to take the sickness back to Africa,\\" he said from his university dormitory, where he is now under a 14-day ...", "provider": [{"_type": "Organization", "name": "BBC", "image": {"thumbnail": {"contentUrl": "https:\\/\

In [46]:
# Parse the JSON text into Python objects; the output below shows the top level is a dict.
obj = json.loads(data)
obj

{'_type': 'News',
 'webSearchUrl': 'https://www.bing.com/news/search?q=&nvaug=%5bNewsVertical+CategoryBrowseRequest%3d%221%22+Category%3d%22rt_World%22+EnableCategoryPagination%3d%221%22+ResultsPerPage%3d%22100%22%5d&form=TNSA02',
 'value': [{'name': 'Coronavirus: A Cameroon student on how he recovered',
   'url': 'https://www.bbc.co.uk/news/world-africa-51502711',
   'image': {'thumbnail': {'contentUrl': 'https://www.bing.com/th?id=ON.948A409B725B8BBE5C4593EA563FF872&pid=News',
     'width': 700,
     'height': 393}},
   'description': 'When Kem Senou Pavel Daryl, a 21-year-old Cameroonian student living in the Chinese city of Jingzhou, contracted the coronavirus he had no intention of leaving China, even if that were possible. "No matter what happens I don\'t want to take the sickness back to Africa," he said from his university dormitory, where he is now under a 14-day ...',
   'provider': [{'_type': 'Organization',
     'name': 'BBC',
     'image': {'thumbnail': {'contentUrl': 'htt

In [47]:
# The article records are stored as a list under the 'value' key of the response.
val = obj['value']
val

[{'name': 'Coronavirus: A Cameroon student on how he recovered',
  'url': 'https://www.bbc.co.uk/news/world-africa-51502711',
  'image': {'thumbnail': {'contentUrl': 'https://www.bing.com/th?id=ON.948A409B725B8BBE5C4593EA563FF872&pid=News',
    'width': 700,
    'height': 393}},
  'description': 'When Kem Senou Pavel Daryl, a 21-year-old Cameroonian student living in the Chinese city of Jingzhou, contracted the coronavirus he had no intention of leaving China, even if that were possible. "No matter what happens I don\'t want to take the sickness back to Africa," he said from his university dormitory, where he is now under a 14-day ...',
  'provider': [{'_type': 'Organization',
    'name': 'BBC',
    'image': {'thumbnail': {'contentUrl': 'https://www.bing.com/th?id=AR_b639c1691c4fa767d85fd87b7042f9e6&pid=news'}}}],
  'datePublished': '2020-02-17T06:56:00.0000000Z',
  'category': 'World',
  'ampUrl': 'https://www.bbc.co.uk/news/amp/world-africa-51502711'},
 {'name': "Coronavirus: America

In [49]:
# Inspect which fields the first article record provides.
val[0].keys()

dict_keys(['name', 'url', 'image', 'description', 'provider', 'datePublished', 'category', 'ampUrl'])

In [51]:
# Append the first article as a new row. The row label is the current length of df,
# which is the next free label as long as df keeps its default 0..n-1 index.
l = len(df)
df.loc[l] = [val[0][field] for field in ("name", "description", "category")]

In [52]:
# Display df to confirm that the first article was appended.
df

Unnamed: 0,name,description,category
0,Coronavirus: A Cameroon student on how he reco...,"When Kem Senou Pavel Daryl, a 21-year-old Came...",World


In [53]:
# Categories that are still to be downloaded, one API request per entry.
category_list = [
    "Business",
    "Entertainment",
    "Health",
    "Politics",
    "ScienceAndTechnology",
    "Sports",
    "UK",
]

## category_list 에 따른 api 요청 후, dataframe에 저장

In [55]:
# Download the articles of every category in category_list and append them to df.
for category in category_list:

    # Available categories: World, Business, Entertainment, Health, Politics,
    # ScienceAndTechnology, Sports, UK.
    params = urllib.parse.urlencode({
        'Category': category,
        'Market': 'en-GB',
        'Count': 100
    })

    conn = http.client.HTTPSConnection('api.cognitive.microsoft.com')
    try:
        # NOTE(review): '{body}' looks like an unfilled template placeholder that is sent as
        # the request body; a GET normally needs no body. Kept as-is to leave the request unchanged.
        conn.request('GET', '/bing/v7.0/news/?%s' % params, '{body}', headers)
        response = conn.getresponse()
        data = response.read()
    except Exception as e:
        # Only OSError-style exceptions carry errno/strerror, so fall back to the exception
        # text instead of raising AttributeError inside the handler.
        print(f"Error {getattr(e, 'errno', None)},   {getattr(e, 'strerror', None) or e}")
        # Skip this category: otherwise the code below would process `data` left over from
        # the previous request and append those rows a second time.
        continue
    finally:
        # Close the connection on success and on failure alike.
        conn.close()

    data = data.decode("utf-8")
    obj = json.loads(data)

    # A response without a 'value' list (for example an error payload) has nothing to append.
    val = obj.get("value")
    if val is None:
        print(f"Error: no 'value' in the response for category {category!r} (HTTP {response.status})")
        continue

    # New rows are labelled from the current length of df onwards.
    l = len(df)

    for i, article in enumerate(val):
        df.loc[l + i] = [article["name"],
                         article["description"],
                         article["category"]
                         ]

In [56]:
# Display df after the articles of all categories have been appended.
df

Unnamed: 0,name,description,category
0,Coronavirus: A Cameroon student on how he reco...,"When Kem Senou Pavel Daryl, a 21-year-old Came...",World
1,GM scraps historic Holden car brand in Australia,General Motors has said it will retire the ico...,Business
2,Japan on brink of recession as economy contrac...,TOKYO (Reuters) - Japan’s economy shrank at th...,Business
3,Europe to open higher after China pledges fres...,Rishi Sunak urged to suspend radical tax shake...,Business
4,Is it finally time to buy shares in BT for tha...,The BT (LSE: BT.A) share price has been drifti...,Business
...,...,...,...
675,"PSNI seize £655,000 worth of heroin and cannab...",The results came from searches at locations ac...,UK
676,Flood warnings in force and roads closed after...,Flood alerts and warnings remain in force acro...,UK
677,Armed Forces arrive in Calderdale to support f...,UK Armed Forces personnel has been deployed to...,UK
678,Body discovered on East Wittering Beach near S...,A body has been discovered on East Wittering B...,UK


In [57]:
# Preview a random half of the rows. The result is only displayed, not assigned, so df
# itself stays unchanged; no random_state is set, so the rows shown differ between runs.
df.sample(frac=0.5)

Unnamed: 0,name,description,category
27,Xerox to hold a dinner for HP shareholders to ...,Xerox to hold a dinner for HP shareholders to ...,Business
61,Freshney Place's Virgin Media store to close,The Virgin Media store at Freshney Place will ...,Business
645,Man set to appear in court charged with murder...,The trial of a Stoke-on-Trent man charged with...,UK
48,Jupiter in 'advanced discussions' to acquire M...,"In a statement issued on 15 February, the boar...",Business
601,Lifeboat team buoyant after Beer fundraiser,During the event members also sold £83.25 of c...,UK
...,...,...,...
367,Voice of the Mirror: Victims of floods are rig...,Our flooded families deserve help and the vict...,Politics
133,When is Venom 2 released in cinemas? Who’s in ...,Following the success of Tom Hardy’s Spider-Ma...,Entertainment
433,Self-driving cars could harness the homing pro...,Bee brains could hold the secrets to future au...,ScienceAndTechnology
481,Twenty20 World Cup: The questions facing Engla...,Victory in the decider at Centurion on Sunday ...,Sports


## dataframe 내용 csv 파일로 저장

In [58]:
# Save the collected articles to a CSV file in the current working directory.
# NOTE(review): the file name says "shuffle", but df is written in its original order;
# the sample() call in the previous cell was only displayed, never assigned.
# If a shuffled file is intended, the frame has to be shuffled before saving. Confirm.
df.to_csv("bing_news_shuffle.csv")