In [1]:
words_list = [
    "bad bad bad nice",
    "bad bad",
    "bad bad bad",
    "bad bad bad bad",
    "bad bad bad good good",
    "bad bad bad nice nice"
]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
counter = CountVectorizer()

In [3]:
X = counter.fit_transform(words_list)
X

<6x3 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [4]:
X.toarray()

array([[3, 0, 1],
       [2, 0, 0],
       [3, 0, 0],
       [4, 0, 0],
       [3, 2, 0],
       [3, 0, 2]], dtype=int64)

## bad가 X.toarray()의 0번째 열을 나타내고, good이 1번째 열을 나타낸다

In [5]:
counter.get_feature_names()

['bad', 'good', 'nice']

## IDF 계산
### n 은 전체 문장 수
### df(X)  는 X라는 문자가 나온 문장의 수

- df(bad) = 6
- df(nice) = 2
- df(good) = 1

### IDF를 계산하면
$$
 IDF(X) = \log{ \frac{1+n}{1+\mathbf{df(X)}}} + 1
$$

### TF-IDF 계산
$$
 \mathbf{tf-idf} = tf * idf
$$

### TF-IDF를 L2 계산을 해주면
- y형태와 같이 계산
![](https://freesouls.github.io/imgs/1.png)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [7]:
vectorizer = TfidfVectorizer()

In [8]:
tfidf_vector = vectorizer.fit_transform(words_list)
tfidf_vector.toarray()

array([[0.85151335, 0.        , 0.52433293],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.55422893, 0.83236428, 0.        ],
       [0.63035731, 0.        , 0.77630514]])

In [9]:
vectorizer.get_feature_names()

['bad', 'good', 'nice']

In [10]:
df01 = pd.DataFrame(tfidf_vector.toarray(), columns=vectorizer.get_feature_names())
df01

Unnamed: 0,bad,good,nice
0,0.851513,0.0,0.524333
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.554229,0.832364,0.0
5,0.630357,0.0,0.776305


In [11]:
new_text = [
    "today is bad bad bad nice",
    "columns are bad bad bad good",
    "god is nice nice nice good"
]

In [12]:
new_X = vectorizer.transform(new_text)

In [13]:
new_X.toarray()

array([[0.85151335, 0.        , 0.52433293],
       [0.79964636, 0.60047123, 0.        ],
       [0.        , 0.37657342, 0.92638678]])

In [14]:
df02 = pd.DataFrame(new_X.toarray(), columns = vectorizer.get_feature_names())
df02

Unnamed: 0,bad,good,nice
0,0.851513,0.0,0.524333
1,0.799646,0.600471,0.0
2,0.0,0.376573,0.926387


## 영어 뉴스 데이터 가져오기

In [15]:
import http.client, urllib.request, urllib.parse, urllib.error, base64
import json
import pandas as pd
import numpy as np

In [16]:
df = pd.DataFrame(columns=('name', 'description', 'category'))
df

Unnamed: 0,name,description,category


In [21]:
# headers = {
#     'Ocp-Apim-Subscription-Key':'8cb58112421523421192c87d'
# }

In [22]:
params = urllib.parse.urlencode({
    # World,Business, Entertainment, Health, Politics, ScienceAndTechnology, Sports, UK 가 있음
    'Category' : 'World',
    'Market' : 'en-GB',
    'Count':100
})

In [23]:
params

'Category=World&Market=en-GB&Count=100'

In [24]:
try:
    conn = http.client.HTTPSConnection('api.cognitive.microsoft.com')
    conn.request('GET','/bing/v7.0/news/?%s' %params, '{body}', headers)
    response = conn.getresponse()
    data = response.read()
    print("data : " ,data)
    conn.close()
except Exception as e:
    print(f"Error {e.errno},   {e.strerror}")


data :  b'{"_type": "News", "webSearchUrl": "https:\\/\\/www.bing.com\\/news\\/search?q=&nvaug=%5bNewsVertical+CategoryBrowseRequest%3d%221%22+Category%3d%22rt_World%22+EnableCategoryPagination%3d%221%22+ResultsPerPage%3d%22100%22%5d&form=TNSA02", "value": [{"name": "Coronavirus: Largest study suggests elderly and sick are most at risk", "url": "https:\\/\\/www.bbc.co.uk\\/news\\/world-asia-china-51540981", "image": {"thumbnail": {"contentUrl": "https:\\/\\/www.bing.com\\/th?id=ON.B1F3EBD5C38A00121B1C39B98AA86E2B&pid=News", "width": 700, "height": 393}}, "description": "Health officials in China have published the first details of more than 70,000 cases of Covid-19, in the biggest study since the outbreak began. Data from the Chinese Centre for Disease Control and Prevention (CCDC) found that more than 80% of the cases have been mild with the sick and elderly most at risk. The research also points to the high ...", "provider": [{"_type": "Organization", "name": "BBC", "image": {"thumbn

In [25]:
data = data.decode('utf-8')
data

'{"_type": "News", "webSearchUrl": "https:\\/\\/www.bing.com\\/news\\/search?q=&nvaug=%5bNewsVertical+CategoryBrowseRequest%3d%221%22+Category%3d%22rt_World%22+EnableCategoryPagination%3d%221%22+ResultsPerPage%3d%22100%22%5d&form=TNSA02", "value": [{"name": "Coronavirus: Largest study suggests elderly and sick are most at risk", "url": "https:\\/\\/www.bbc.co.uk\\/news\\/world-asia-china-51540981", "image": {"thumbnail": {"contentUrl": "https:\\/\\/www.bing.com\\/th?id=ON.B1F3EBD5C38A00121B1C39B98AA86E2B&pid=News", "width": 700, "height": 393}}, "description": "Health officials in China have published the first details of more than 70,000 cases of Covid-19, in the biggest study since the outbreak began. Data from the Chinese Centre for Disease Control and Prevention (CCDC) found that more than 80% of the cases have been mild with the sick and elderly most at risk. The research also points to the high ...", "provider": [{"_type": "Organization", "name": "BBC", "image": {"thumbnail": {"c

In [26]:
obj = json.loads(data)
obj

{'_type': 'News',
 'webSearchUrl': 'https://www.bing.com/news/search?q=&nvaug=%5bNewsVertical+CategoryBrowseRequest%3d%221%22+Category%3d%22rt_World%22+EnableCategoryPagination%3d%221%22+ResultsPerPage%3d%22100%22%5d&form=TNSA02',
 'value': [{'name': 'Coronavirus: Largest study suggests elderly and sick are most at risk',
   'url': 'https://www.bbc.co.uk/news/world-asia-china-51540981',
   'image': {'thumbnail': {'contentUrl': 'https://www.bing.com/th?id=ON.B1F3EBD5C38A00121B1C39B98AA86E2B&pid=News',
     'width': 700,
     'height': 393}},
   'description': 'Health officials in China have published the first details of more than 70,000 cases of Covid-19, in the biggest study since the outbreak began. Data from the Chinese Centre for Disease Control and Prevention (CCDC) found that more than 80% of the cases have been mild with the sick and elderly most at risk. The research also points to the high ...',
   'provider': [{'_type': 'Organization',
     'name': 'BBC',
     'image': {'thum

In [27]:
val = obj['value']
val

[{'name': 'Coronavirus: Largest study suggests elderly and sick are most at risk',
  'url': 'https://www.bbc.co.uk/news/world-asia-china-51540981',
  'image': {'thumbnail': {'contentUrl': 'https://www.bing.com/th?id=ON.B1F3EBD5C38A00121B1C39B98AA86E2B&pid=News',
    'width': 700,
    'height': 393}},
  'description': 'Health officials in China have published the first details of more than 70,000 cases of Covid-19, in the biggest study since the outbreak began. Data from the Chinese Centre for Disease Control and Prevention (CCDC) found that more than 80% of the cases have been mild with the sick and elderly most at risk. The research also points to the high ...',
  'provider': [{'_type': 'Organization',
    'name': 'BBC',
    'image': {'thumbnail': {'contentUrl': 'https://www.bing.com/th?id=AR_b639c1691c4fa767d85fd87b7042f9e6&pid=news'}}}],
  'datePublished': '2020-02-18T05:55:00.0000000Z',
  'category': 'World',
  'ampUrl': 'https://www.bbc.co.uk/news/amp/world-asia-china-51540981'},


In [28]:
val[0].keys()

dict_keys(['name', 'url', 'image', 'description', 'provider', 'datePublished', 'category', 'ampUrl'])

In [29]:
l = len(df)

df.loc[l + 0] = [val[0]["name"],
                val[0]["description"],
                val[0]["category"]
                ]

In [30]:
df

Unnamed: 0,name,description,category
0,Coronavirus: Largest study suggests elderly an...,Health officials in China have published the f...,World


In [31]:
category_list = ["World","Business", "Entertainment", "Health", "Politics",
                "ScienceAndTechnology", "Sports", "UK"]

## category_list 에 따른 api 요청 후, dataframe에 저장

In [32]:
for category in category_list:

    params = urllib.parse.urlencode({
    # World,Business, Entertainment, Health, Politics, ScienceAndTechnology, Sports, UK 가 있음
    'Category' : category,
    'Market' : 'en-GB',
    'Count':100
    })
    try:
        conn = http.client.HTTPSConnection('api.cognitive.microsoft.com')
        conn.request('GET','/bing/v7.0/news/?%s' %params, '{body}', headers)
        response = conn.getresponse()
        data = response.read()
        conn.close()
    except Exception as e:
        print(f"Error {e.errno},   {e.strerror}")
    
    data = data.decode("utf-8")
    obj = json.loads(data)
    val = obj["value"]
    
    l = len(df)

    for i in range(len(val)):
        df.loc[l + i] = [val[i]["name"],
                        val[i]["description"],
                        val[i]["category"]
                        ]

In [33]:
df

Unnamed: 0,name,description,category
0,Coronavirus: Largest study suggests elderly an...,Health officials in China have published the f...,World
1,Coronavirus: Largest study suggests elderly an...,Health officials in China have published the f...,World
2,'From bad to worse': Dashed hopes may deter ma...,"DUBAI (Reuters) - Confrontation with America, ...",World
3,Caroline Flack’s best friend shares heartbreak...,CAROLINE Flack's close friend who was with her...,World
4,Healthcare reform gives Sanders a headache in ...,Bernie Sanders heads in to Saturday’s Nevada c...,World
...,...,...,...
704,"Over 674,000 people sign two separate petition...",Hundreds of thousands of people have signed th...,UK
705,Love Island star Caroline Flack said she would...,The former Love Island presenter was found dea...,UK
706,Police investigate after girl heard screaming ...,Police are looking to identify a girl who was ...,UK
707,Mum and son forced to flee home as they 'lose ...,A mother and her son have been forced to move ...,UK


In [34]:
df.sample(frac=0.5)

Unnamed: 0,name,description,category
441,Vestas debuts EnVentus giant in Sweden,Vestas has secured a 39MW order for its new tu...,ScienceAndTechnology
252,Stroke Association Resolution Run Worthing 202...,"Whether you are taking part, volunteering or w...",Health
144,Victoria Beckham won't let model daughter Harp...,Victoria reveals that she drew a line at eight...,Entertainment
645,"Do you recognise this man, suspected to be inv...",Detectives are appealing for information follo...,UK
302,Potholed Bradley Stoke road to be resurfaced,One of the most potholed sections of highway i...,Health
...,...,...,...
669,Work starts on restoration of Ipswich's Unitar...,The Unitarian Meeting House was opened in 1700...,UK
151,Kim Kardashian 'breaks her no dance rule' for ...,Kim Kardashian has shown that she still posses...,Entertainment
291,Leeds surgeon suspended as almost '50 patients...,Mike Walsh had been working at the private Spi...,Health
567,Dagenham & Redbridge's clash with league leade...,The Daggers were due to take on Ian Evatt's si...,Sports


## dataframe 내용 csv 파일로 저장

In [35]:
df.to_csv("bing_news_shuffle.csv")