# Data Processing and model building

일단 빠른 진행을 위해 reweight 을 적용하지 않기로 한 상태.(tfidf 관련 feature 들은 모두 사용되지 않게 될 것 같음.)

이렇게 진행하기 위해선 기존의 코드를 전면적으로 수정해야하기 때문에 필요한 코드들을 여기에 복붙하고 수정해서 사용하는 방식으로 진행함.

수정된 부분은 최대한 그 목차에서 설명하겠음



> 주의 : 수정하고 싶으면 반드시 사본을 만들어서 거기에 수정하기



# 0) Import


In [0]:
from collections import Counter, defaultdict
from itertools import combinations
import nltk

from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tree import Tree

import os
import numpy as np

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm_notebook

import time

# 1) Fake News Detection 코드

여기서 우리 프로세스는

1. 전처리 및 Co-Occurrence 데이터 프레임 생성
- 여기서 전처리를 진행할때 입력값은 TiCNN 데이터 셋의 각 본문이 될 것임.
- 각 기사에 대한 Co-Occurrence 데이터 프레임이 그대로 메모리에 남아있으면 2만개나 되니 메모리 터질 우려가 있어 일단 우리 공유 드라이브에 각각을 저장해둘거임 (아래 코드 참조)


2. 만들어진 데이터프레임을 불러와서 각각 원래 만들어둔 피쳐 만들고 Ti CNN 데이터 프레임에 추가해주기 

- 내가 진행못함

## nltk download(시작하기 전에 실행)

In [0]:
# Fetch the NLTK resources this notebook relies on (WordNet, the punkt
# tokenizer, the POS tagger, the stop-word list and the NE chunker data).
# Run this cell once per fresh runtime before using the Processing class.
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

## Cooccurrence.py (수정사항 확인)(새로 돌릴 시 진행)

원본에서 수정사항

1. tag filter 없이 tag 의 첫글자가 (startswith 함수 이용) V, N, R, J 인 경우만 선택하는 것으로 바꿈(select result 함수)

2. tag filter 변수 init에서 전부 삭제

3. sentence.replace 에 Donald J. Trump 추가





In [0]:
class Processing():
    """Turn raw article text into filtered word lists and a co-occurrence table.

    The public entry point is :meth:`cooc`. The remaining methods are the
    individual pipeline stages it chains together: collocation joining,
    lemmatisation, stop-word removal, POS tagging, tag filtering and pair
    counting.
    """

    def __init__(self):
        # First letter of a POS tag -> WordNet POS constant. Any letter other
        # than J / V / R falls back to noun.
        self.tag_map = defaultdict(lambda: wn.NOUN)
        self.tag_map['J'] = wn.ADJ
        self.tag_map['V'] = wn.VERB
        self.tag_map['R'] = wn.ADV

    @staticmethod
    def apply_collocations(sentence):
        """Join known multi-word names with underscores.

        :param sentence: (str) raw text
        :return: (str) text with the hard-coded names replaced
        """
        sentence = sentence.replace("George H.W. Bush", "George_H.W._Bush")
        sentence = sentence.replace("Donald J. Trump", "Donald_J._Trump")
        # add any phrase here
        return sentence

    def lemma_sentence(self, text):
        """Tokenize, NE-merge and lemmatize a single sentence.

        :param text: (str) one sentence
        :return: (list of str) lemmas, one per token
        """
        # NOTE(review): the original note said forms such as "is" are not
        # converted to "be" here -- not verified against the lemmatizer.
        tokens = word_tokenize(text)
        # Merge named entities into single underscore-joined tokens first.
        tokens = self.ner_chunk(tokens)
        lmtzr = WordNetLemmatizer()
        # Tokens that must be rewritten before lemmatization.
        replace_data = {"n't": 'not'}
        results = []
        for token, tag in pos_tag(tokens):
            token = replace_data.get(token, token)
            results.append(lmtzr.lemmatize(token, self.tag_map[tag[0]]))
        return results

    def lemma_text(self, text):
        """Lemmatize a whole document sentence by sentence.

        :param text: (str) document text
        :return: (list of list of str) lemmas grouped by sentence
        """
        return [self.lemma_sentence(sent) for sent in sent_tokenize(text)]

    def stopword(self, sentences):
        """Drop stop words, listed punctuation tokens and all-digit tokens.

        From here on the data is a list of token lists rather than a string.
        'not' is deliberately kept even though it is an English stop word.

        :param sentences: (list of list of str) tokenized sentences
        :return: (list of list of str) same shape with filtered tokens
        """
        stop_words = set(stopwords.words('english')) - {'not'}
        added_stopword = {'“', '”', '.', ',', '-', "—", "–", "'s", "n't", "''",
                          ';', '&', "``", '?', "‘", "’"}
        return [
            [w for w in sentence
             if w not in stop_words
             and w not in added_stopword
             and not w.isdigit()]
            for sentence in sentences
        ]

    def ner_chunk(self, tokens):
        """Merge each named-entity chunk into one underscore-joined token.

        "George H.W. Bush" is handled separately in apply_collocations.

        :param tokens: (list of str) tokens of one sentence
        :return: (list of str) tokens with every chunk collapsed to one token
        """
        chunked = ne_chunk(pos_tag(tokens), binary=True)
        merged = []
        for node in chunked:
            if isinstance(node, Tree):
                # A subtree is a named entity: glue its leaves together.
                merged.append("_".join(token for token, pos in node.leaves()))
            else:
                # A plain (token, tag) pair: keep the token only.
                merged.append(node[0])
        return merged

    def tag_content(self, sentences):
        """
        Tag all words in content
        :param sentences:(list) processed data
        :return: (list) tagged words divided by each sentence
        """
        return [pos_tag(sentence) for sentence in sentences]

    def select_results(self, sentences):
        """
        Keep only words whose tag starts with V, N, R or J.
        :param sentences: (list) sentences as lists of (word, tag) pairs
        :return: (list) words divided by each sentence; sentences left with
                 no word are dropped
        """
        kept_prefixes = ('V', 'N', 'R', 'J')
        selected_results = []
        for sentence in sentences:
            # Keep the word only, not its tag.
            selection = [lex for lex, tag in sentence if tag.startswith(kept_prefixes)]
            if selection:
                selected_results.append(selection)
        return selected_results

    def create_cooc_mat(self, sentences):
        """
        Create Co-Occurrence Matrix
        :param sentences: (list of list) processed data
        :return: (DataFrame) columns 'Linkage' ([word1, word2]) and 'Weight'
                 (how many times the two words share a sentence), sorted by
                 'Weight' in descending order
        """
        word_cooc_mat = Counter()
        for sentence in sentences:
            for w1, w2 in combinations(sentence, 2):
                # Skip one-character words and pairs of the same word.
                if len(w1) == 1 or len(w2) == 1 or w1 == w2:
                    continue
                if (w2, w1) in word_cooc_mat:
                    # Same pair seen earlier in the opposite order: count it
                    # on the key that was created first.
                    word_cooc_mat[(w2, w1)] += 1
                else:
                    word_cooc_mat[(w1, w2)] += 1

        linkages = pd.Series([list(pair) for pair in word_cooc_mat])
        weights = pd.Series(list(word_cooc_mat.values()))
        data = pd.DataFrame({'Linkage': linkages, 'Weight': weights})
        return data.sort_values(by=['Weight'], ascending=False)

    def cooc(self, filepath=None, text=None):
        """Run the full pipeline on a file or on a string.

        :param filepath: (str) UTF-8 text file to read; takes precedence over text
        :param text: (str) document text, used when filepath is None
        :return: (tuple) selected words per sentence, co-occurrence DataFrame
        :raises ValueError: when neither filepath nor text is given
        """
        if filepath is not None:
            # Context manager so the handle is closed even if read() fails.
            with open(filepath, encoding='utf-8') as handle:
                text = handle.read()
        if text is None:
            raise ValueError("either filepath or text must be provided")

        text = self.apply_collocations(text)
        lem_sents = self.lemma_text(text)
        stop_sents = self.stopword(lem_sents)
        tag_sents = self.tag_content(stop_sents)
        sel_sents = self.select_results(tag_sents)  # word lists
        cooc_mat = self.create_cooc_mat(sel_sents)  # word-pair linkage table
        return sel_sents, cooc_mat

# 2) Ti CNN 데이터 다루기


데이터를 구글 드라이브에서 불러올건데 미리 우리 공유 폴더에 share drive로 만들어둠. 

데이터를 딱 보면 텍스트에 <U+2019> 등등 이상한 단어들이 보임. 정규표현식을 사용해서 없애준 뒤 우리가 원하는 것들 진행할 거임



## Data 불러와서 index reset 시키기 (완료)(새로 돌릴 시 진행)

In [0]:
# Mount Google Drive so the shared-drive data folder is reachable under
# /content/drive (prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Make the shared data folder the working directory (later cells use relative
# paths) and list what it contains.
%cd '/content/drive/Shared drives/2019_URP_2/data'
!ls

/content/drive/Shared drives/2019_URP_2/data
all_data_mini.csv  all_data_modified.csv  cooc	cooc_mini  glove.42B  re_cooc


In [0]:
# Load the full dataset from the current working directory and preview it.
# The commented-out line loads the small sample file instead.
df = pd.read_csv("all_data_modified.csv")
# df = pd.read_csv("all_data_mini.csv", index_col=0)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,title,text,caps_title,caps_text,excl_title,excl_text,title_len,text_len,type,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,...,Unnamed: 120,Unnamed: 121,Unnamed: 122,Unnamed: 123,Unnamed: 124,Unnamed: 125,Unnamed: 126,Unnamed: 127,Unnamed: 128,Unnamed: 129,Unnamed: 130,Unnamed: 131,Unnamed: 132,Unnamed: 133,Unnamed: 134,Unnamed: 135,Unnamed: 136,Unnamed: 137,Unnamed: 138,Unnamed: 139,Unnamed: 140,Unnamed: 141,Unnamed: 142,Unnamed: 143,Unnamed: 144,Unnamed: 145,Unnamed: 146,Unnamed: 147,Unnamed: 148,Unnamed: 149,Unnamed: 150,Unnamed: 151,Unnamed: 152,Unnamed: 153,Unnamed: 154,Unnamed: 155,Unnamed: 156,Unnamed: 157,Unnamed: 158,Unnamed: 159
0,Chappatte on the Email Investigation,"The F.B.I. warning seal, updated for 2016. Pat...",0,0,0,0,36,309,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Ted Cruz Says Only He Can Beat Donald Trump,"MYRTLE BEACH, S.C.<U+2014> Ted Cruz has a simp...",0,2,0,0,43,314,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Donald Trump Gets to Work,A wall of a thousand miles begins with one bri...,0,0,0,0,25,316,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Senators Seek <U+2018>Living Wage<U+2019> for ...,A group of senators is calling for higher wage...,0,0,0,0,56,316,real,,,,,,,,,,,,<U+0447><U+0442><U+043E> <U+0436><U+0435><U+0...,<U+0438><U+043D><U+0442><U+0435><U+0440><U+04...,<U+0432> <U+0445><U+043E><U+0434><U+0435> <U+...,<U+0432><U+0441><U+0442><U+0440><U+0435><U+04...,<U+043A><U+043E><U+0442><U+043E><U+0440><U+04...,<U+043E><U+0442><U+043C><U+0435><U+043D><U+04...,<U+0447><U+0442><U+043E> <U+043F><U+0440><U+0...,<U+0447><U+0442><U+043E><U+0431><U+044B>,<U+0441><U+043B><U+0435><U+0434><U+0443><U+04...,<U+043E><U+0441><U+0432><U+043E><U+0431><U+04...,<U+043D><U+043E><U+0440><U+043C><U+0430><U+04...,<U+0430> 42 <U+044E><U+0436><U+043D><U+043E><...,<U+043F><U+043E><U+0447><U+0442><U+0438> <U+0...,<U+0442><U+0440><U+0435><U+0431><U+0443><U+04...,<U+0447><U+0442><U+043E> <U+043F><U+0440><U+0...,<U+0434><U+0438><U+043F><U+043B><U+043E><U+04...,<U+0442><U+043E> <U+043D><U+0435><U+0438><U+0...,<U+0447><U+0442><U+043E> <U+043D><U+0435><U+0...,<U+0447><U+0442><U+043E> <U+043E><U+043D><U+0...,<U+043D><U+0435> <U+0442><U+043E> <U+043E><U+...,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Five Reasons the New York Democratic Primary F...,"NEW YORK, N.Y. <U+2014> If Hillary Clinton is ...",0,2,0,0,61,318,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
# Show the row index to confirm how many articles were loaded.
print(df.index)

RangeIndex(start=0, stop=18418, step=1)


In [0]:
# Index reset is disabled; the frame is only previewed here.
# df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,title,text,caps_title,caps_text,excl_title,excl_text,title_len,text_len,type,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,...,Unnamed: 120,Unnamed: 121,Unnamed: 122,Unnamed: 123,Unnamed: 124,Unnamed: 125,Unnamed: 126,Unnamed: 127,Unnamed: 128,Unnamed: 129,Unnamed: 130,Unnamed: 131,Unnamed: 132,Unnamed: 133,Unnamed: 134,Unnamed: 135,Unnamed: 136,Unnamed: 137,Unnamed: 138,Unnamed: 139,Unnamed: 140,Unnamed: 141,Unnamed: 142,Unnamed: 143,Unnamed: 144,Unnamed: 145,Unnamed: 146,Unnamed: 147,Unnamed: 148,Unnamed: 149,Unnamed: 150,Unnamed: 151,Unnamed: 152,Unnamed: 153,Unnamed: 154,Unnamed: 155,Unnamed: 156,Unnamed: 157,Unnamed: 158,Unnamed: 159
0,Chappatte on the Email Investigation,"The F.B.I. warning seal, updated for 2016. Pat...",0,0,0,0,36,309,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Ted Cruz Says Only He Can Beat Donald Trump,"MYRTLE BEACH, S.C. Ted Cruz has a simple messa...",0,2,0,0,43,314,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Donald Trump Gets to Work,A wall of a thousand miles begins with one bri...,0,0,0,0,25,316,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Senators Seek Living Wage for Capitol Hill Con...,A group of senators is calling for higher wage...,0,0,0,0,56,316,real,,,,,,,,,,,,<U+0447><U+0442><U+043E> <U+0436><U+0435><U+0...,<U+0438><U+043D><U+0442><U+0435><U+0440><U+04...,<U+0432> <U+0445><U+043E><U+0434><U+0435> <U+...,<U+0432><U+0441><U+0442><U+0440><U+0435><U+04...,<U+043A><U+043E><U+0442><U+043E><U+0440><U+04...,<U+043E><U+0442><U+043C><U+0435><U+043D><U+04...,<U+0447><U+0442><U+043E> <U+043F><U+0440><U+0...,<U+0447><U+0442><U+043E><U+0431><U+044B>,<U+0441><U+043B><U+0435><U+0434><U+0443><U+04...,<U+043E><U+0441><U+0432><U+043E><U+0431><U+04...,<U+043D><U+043E><U+0440><U+043C><U+0430><U+04...,<U+0430> 42 <U+044E><U+0436><U+043D><U+043E><...,<U+043F><U+043E><U+0447><U+0442><U+0438> <U+0...,<U+0442><U+0440><U+0435><U+0431><U+0443><U+04...,<U+0447><U+0442><U+043E> <U+043F><U+0440><U+0...,<U+0434><U+0438><U+043F><U+043B><U+043E><U+04...,<U+0442><U+043E> <U+043D><U+0435><U+0438><U+0...,<U+0447><U+0442><U+043E> <U+043D><U+0435><U+0...,<U+0447><U+0442><U+043E> <U+043E><U+043D><U+0...,<U+043D><U+0435> <U+0442><U+043E> <U+043E><U+...,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Five Reasons the New York Democratic Primary F...,"NEW YORK, N.Y. If Hillary Clinton is winning ...",0,2,0,0,61,318,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## 정규표현식 연습 (완료)

In [0]:
# Sample string containing the <U+XXXX> markers seen in the dataset.
example = "<U+201C>Trump.<U+201D> Trump monogrammed sweaters, towels and glassware. Trump cologne. <U+201C>"
print(example)

<U+201C>Trump.<U+201D> Trump monogrammed sweaters, towels and glassware. Trump cologne. <U+201C>


In [0]:
# Confirm the sample is a plain str.
type(example)

str

In [0]:
import re
# Remove every "<...>" span whose "<" is directly followed by a word
# character; the trailing lazy group "(.*?)" matches the empty string.
text = re.sub(r'<\b[^>]*>(.*?)', '', example)
print(text)

Trump. Trump monogrammed sweaters, towels and glassware. Trump cologne. 


## title 과 text 정규표현식으로 수정하기 (완료)(새로 돌릴 시 진행)

In [0]:
# Strip the "<U+XXXX>"-style markers from the title and text columns.
# regex=True is passed explicitly: without it, pandas versions whose
# str.replace defaults to literal matching would leave the markers in place.
# astype(str) is applied first, so missing values become ordinary strings.
title = df.loc[:, 'title'].astype(str).str.replace(r'<\b[^>]*>(.*?)', '', regex=True)
text = df.loc[:, 'text'].astype(str).str.replace(r'<\b[^>]*>(.*?)', '', regex=True)

In [0]:
# Write the cleaned columns back into the dataframe.
df['title'] = title
df['text'] = text

In [0]:
# Preview the frame after cleaning title and text.
df.head()

Unnamed: 0,title,text,caps_title,caps_text,excl_title,excl_text,title_len,text_len,type,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,...,Unnamed: 120,Unnamed: 121,Unnamed: 122,Unnamed: 123,Unnamed: 124,Unnamed: 125,Unnamed: 126,Unnamed: 127,Unnamed: 128,Unnamed: 129,Unnamed: 130,Unnamed: 131,Unnamed: 132,Unnamed: 133,Unnamed: 134,Unnamed: 135,Unnamed: 136,Unnamed: 137,Unnamed: 138,Unnamed: 139,Unnamed: 140,Unnamed: 141,Unnamed: 142,Unnamed: 143,Unnamed: 144,Unnamed: 145,Unnamed: 146,Unnamed: 147,Unnamed: 148,Unnamed: 149,Unnamed: 150,Unnamed: 151,Unnamed: 152,Unnamed: 153,Unnamed: 154,Unnamed: 155,Unnamed: 156,Unnamed: 157,Unnamed: 158,Unnamed: 159
0,Chappatte on the Email Investigation,"The F.B.I. warning seal, updated for 2016. Pat...",0,0,0,0,36,309,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Ted Cruz Says Only He Can Beat Donald Trump,"MYRTLE BEACH, S.C. Ted Cruz has a simple messa...",0,2,0,0,43,314,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Donald Trump Gets to Work,A wall of a thousand miles begins with one bri...,0,0,0,0,25,316,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Senators Seek Living Wage for Capitol Hill Con...,A group of senators is calling for higher wage...,0,0,0,0,56,316,real,,,,,,,,,,,,<U+0447><U+0442><U+043E> <U+0436><U+0435><U+0...,<U+0438><U+043D><U+0442><U+0435><U+0440><U+04...,<U+0432> <U+0445><U+043E><U+0434><U+0435> <U+...,<U+0432><U+0441><U+0442><U+0440><U+0435><U+04...,<U+043A><U+043E><U+0442><U+043E><U+0440><U+04...,<U+043E><U+0442><U+043C><U+0435><U+043D><U+04...,<U+0447><U+0442><U+043E> <U+043F><U+0440><U+0...,<U+0447><U+0442><U+043E><U+0431><U+044B>,<U+0441><U+043B><U+0435><U+0434><U+0443><U+04...,<U+043E><U+0441><U+0432><U+043E><U+0431><U+04...,<U+043D><U+043E><U+0440><U+043C><U+0430><U+04...,<U+0430> 42 <U+044E><U+0436><U+043D><U+043E><...,<U+043F><U+043E><U+0447><U+0442><U+0438> <U+0...,<U+0442><U+0440><U+0435><U+0431><U+0443><U+04...,<U+0447><U+0442><U+043E> <U+043F><U+0440><U+0...,<U+0434><U+0438><U+043F><U+043B><U+043E><U+04...,<U+0442><U+043E> <U+043D><U+0435><U+0438><U+0...,<U+0447><U+0442><U+043E> <U+043D><U+0435><U+0...,<U+0447><U+0442><U+043E> <U+043E><U+043D><U+0...,<U+043D><U+0435> <U+0442><U+043E> <U+043E><U+...,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Five Reasons the New York Democratic Primary F...,"NEW YORK, N.Y. If Hillary Clinton is winning ...",0,2,0,0,61,318,real,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# 3) 기사별로 cooc 진행 


cooc 를 그대로 진행할 건데 혹시 모를 메모리가 터질 문제를 방지하기 위해서 일단 cooc 만들어진 결과를 csv 로 저장해둘 거임. 

위치는 공유 드라이브-2019 URP-data-cooc 임.

## 100개만 테스트 (완료)

각 기사 별로 cooc 를 진행할 것임. 일단 100개만 진행해서 걸리는 시간 부터 확인

In [0]:
import os
import time

# Timing run: build and save the co-occurrence table of the first 100
# articles, one CSV per article named after its row label.
N = Processing()
start = time.time()
# os.makedirs instead of shelling out to `mkdir`: portable, and no error when
# the folder already exists.
os.makedirs('cooc/', exist_ok=True)
for i in range(100):  # first 100 articles only
    _, cooc = N.cooc(text=df.loc[i, 'text'])
    cooc.to_csv('cooc/' + str(i) + '.csv')
end = time.time()
print("It takes :", end - start)

It takes : 27.878585815429688


In [0]:
# Row index of the loaded frame (its length is the number of articles).
df.index

RangeIndex(start=0, stop=19919, step=1)

## 전체 진행중 (완료)

100개 진행시 28초 이므로 20000개 진행시 약 5600초, 정도 걸릴 것이라 예상됨. 내일 아침에 와서 다시 진행할 수 있을 정도 수준이므로 일단 돌려놓기



** 이거까지 끝나면 우리 2019 urp / data 폴더 안에 19919 개의 csv 가 만들어져 있어야함.

In [0]:
import os
import time

# Build and save the co-occurrence table of every article in df, one CSV per
# article named after its row label.
# NOTE(review): this cell writes to 'cooc_mini/' while later cells read from
# './cooc/' -- confirm which folder is intended before re-running.
N = Processing()
start = time.time()
# os.makedirs instead of shelling out to `mkdir`: portable, and no error when
# the folder already exists.
os.makedirs('cooc_mini/', exist_ok=True)
for i in df.index:  # every article
    _, cooc = N.cooc(text=df.loc[i, 'text'])
    cooc.to_csv('cooc_mini/' + str(i) + '.csv')
end = time.time()
print("It takes :", end - start)

It takes : 6.436292409896851


4108 초 걸림

In [0]:
# Count the entries in ./cooc/ to check that one CSV exists per article.
x = os.listdir('./cooc/')
print(len(x))

18418


# 이후 코드 진행 필요

- Features.py 수정본 적용하기
- df 에 새로운 열 feature 들 추가하기 


# 190908 위 수정 및 이후 진행 (완료)

## Graph

In [0]:
class Graph:
    """Build a networkx graph from a saved co-occurrence CSV.

    Each row of the CSV becomes one weighted edge of ``self.G``.
    """

    def __init__(self):
        self.G = nx.Graph()

    def string_to_list(self, df):
        """Parse the string-encoded 'Linkage' column back into word pairs.

        A CSV round trip stores each pair as the text of a Python list, e.g.
        "['aa', 'bb']". The text is parsed with ast.literal_eval so that words
        containing an apostrophe are handled, and the result is written back
        row by row so every pair stays next to its own 'Weight'.

        :param df: (DataFrame) frame with a 'Linkage' column; modified in place
        :return: (DataFrame) the same frame with 'Linkage' holding (w1, w2) tuples
        """
        import ast

        pairs = [tuple(ast.literal_eval(cell))
                 for cell in df.loc[:, 'Linkage'].astype(str)]
        # Reuse the frame's own index: a default 0..n-1 index would be aligned
        # by label and shuffle the pairs whenever the rows are not in label
        # order (the CSVs are saved sorted by weight).
        df['Linkage'] = pd.Series(pairs, index=df.index, dtype=object)
        return df

    def create_graph(self, doc_path, string_to_list=False):
        """
        Build the graph used for the network features.

        Besides filling ``self.G`` this also stores ``self.T`` (minimum
        spanning tree of G), ``self.node_size`` (degree in T times 100, one
        entry per node) and ``self.pos`` (Fruchterman-Reingold layout of G).

        :param doc_path: path of a co-occurrence CSV whose first column is the index
        :param string_to_list: set to True when loading a previously saved csv,
                               whose 'Linkage' column is string-encoded
        :return: (tuple) the graph ``self.G`` and the loaded DataFrame
        """
        matrix = pd.read_csv(doc_path, index_col=0)
        if string_to_list:
            matrix = self.string_to_list(matrix)

        # Walk the rows positionally so the index labels do not have to be
        # exactly 0..n-1.
        for linkage, count in zip(matrix['Linkage'], matrix['Weight']):
            self.G.add_edge(linkage[0], linkage[1], weight=count)

        # MST (Minimum Spanning Tree) model and its per-node display size.
        self.T = nx.minimum_spanning_tree(self.G)
        degrees = nx.degree(self.T)
        self.node_size = [degrees[node] * 100 for node in nx.nodes(self.T)]
        self.pos = nx.fruchterman_reingold_layout(self.G, k=0.5)
        return self.G, matrix

## TFIDF (tfidf_mean, std 추가)


1.   여기선 reweight 을 일단 안한다 하였으므로 cor2tfidf 함수는 필요없음. 삭제함

2.   대신 각 문서별 tfidf 의 평균, 분산을 구하는 건 피쳐를 2개나 늘려주므로 가치가 있다고 판단함. 그에 대해 tfidf_doc 함수 만들어줌

3.   여기서의 인풋은 df Series 형태로 저장된 뉴스 기사들임. init 수정되었고 get_corpus 함수도 그에 맞게 수정됨. 



In [0]:
class CorTfidf(Processing):
    """Per-document TF-IDF summary statistics over a collection of articles."""

    def __init__(self, dataframe):
        """
        :param dataframe: iterable of article texts (the notebook passes the
                          'text' column as a pandas Series)
        """
        super().__init__()
        self.docs_df = dataframe

    def doc2list(self, text):
        """
        Flatten a document tokenized as a list of token lists into a
        one-element list holding all tokens joined by single spaces.
        :param text: (list of list of str) tokens grouped by sentence
        :return: (list) one string
        """
        sent_joined = [" ".join(sentence) for sentence in text]
        return [" ".join(sent_joined)]

    def get_corpus(self):
        """
        Run every article through the cooc pipeline and join its selected
        words into one string; the strings together form the tfidf corpus.
        :return: (list of str) one entry per article, in input order
        """
        corpus = []
        for doc in self.docs_df:
            processed_text, _ = self.cooc(text=doc)
            corpus.append(self.doc2list(processed_text)[0])
        return corpus

    def tfidf_doc(self):
        """
        Return, for each document, the mean and the variance of its tfidf row.

        The tfidf result is a sparse matrix, which offers mean per axis but no
        std/var, so the variance is derived as E(X^2) - E(X)^2. Note that the
        second return value is a variance even though callers name it "std".

        :return: (tuple) two 1-d arrays with one value per document
        """
        vectorizer = TfidfVectorizer()
        corpus = self.get_corpus()
        vectorizer.fit(corpus)
        tfidf_result = vectorizer.transform(corpus)

        # E(X) per document, flattened from a column to a 1-d array.
        docs_mean = np.asarray(tfidf_result.mean(axis=1)).reshape(-1)
        # E(X^2) per document. reshape(-1) rather than squeeze() so that a
        # one-document corpus still gives a 1-d array the caller can iterate.
        docs_sq = np.asarray(tfidf_result.power(2).mean(axis=1)).reshape(-1)

        docs_std = docs_sq - docs_mean ** 2  # Variance = E(X^2) - E(X)^2

        return docs_mean, docs_std



## Measure (필요없는 부분 삭제)

In [0]:
class Measure():
    """Group-level (centralization) scores derived from node centralities."""

    def __init__(self, graph):
        """
        :param graph: networkx graph the scores are computed on
        """
        self.graph = graph

    def cal_Cent(self, g):
        """Compute betweenness centrality of ``g`` and store the values in
        ``self.list_bet``. Degree and closeness lists are not computed here;
        they are filled on demand by deg_GroupVal / clo_GroupVal.
        """
        bet_cent = nx.algorithms.betweenness_centrality(g)
        self.list_bet = list(bet_cent.values())

    def deg_GroupVal(self):
        """
        Cd = Summation(Max - i th value) / (g-2)(g-1)
        Uses ``self.list_deg``; when it has not been set, it is computed from
        the degree centrality of ``self.graph``.
        :return: (float)  grouped value
        """
        if not hasattr(self, 'list_deg'):
            self.list_deg = list(nx.algorithms.degree_centrality(self.graph).values())
        max_val = max(self.list_deg)
        g = len(self.list_deg)
        gg = (g - 2) * (g - 1)
        print('maximum value of degree cetrality is : {0}'.format(max_val))
        X = 0
        for value in self.list_deg:
            X += max_val - value
        result = X / gg
        print(result)
        return result

    def clo_GroupVal(self):
        """
        Cd = Summation(Max - i th value) / ((g-2)(g-1)/(2g-3))
        Uses ``self.list_clo``; when it has not been set, it is computed from
        the closeness centrality of ``self.graph``.
        :return: (float)  grouped value
        """
        if not hasattr(self, 'list_clo'):
            self.list_clo = list(nx.algorithms.closeness_centrality(self.graph).values())
        max_val = max(self.list_clo)
        g = len(self.list_clo)
        gg = ((g - 2) * (g - 1)) / (2 * g - 3)
        print('maximum value of closeness cetrality is : {0}'.format(max_val))
        X = 0
        for value in self.list_clo:
            X += max_val - value
        result = X / gg
        print(result)
        return result

    def bet_GroupVal(self):
        """
        Cd = Summation(Max - i th value) / ((g-2)^2(g-1)/2 + 1)
        Reads ``self.list_bet`` (set by cal_Cent). The +1 in the denominator
        avoids a zero division for very small graphs.
        :return: (float)  grouped value; 0.0 when the graph has no node
        """
        if not self.list_bet:
            # Empty graph: max() would raise, and there is nothing to measure.
            return 0.0
        max_val = max(self.list_bet)
        g = len(self.list_bet)
        gg = (pow((g - 2), 2) * (g - 1)) / (2)
        X = 0
        for value in self.list_bet:
            X += max_val - value
        return X / (gg + 1)  # +1 guards against zero division

    def get_Value(self):
        """
        Compute the centralities of ``self.graph`` and return the betweenness
        group value.
        :return: (float) betweenness group value
        """
        self.cal_Cent(self.graph)
        return self.bet_GroupVal()

## Features.py (networkx 피쳐 일부 추가)(tfidf 함수 추가)

 * 현재 density, degree mean,var 추가
 * degree는 만들긴 했는데 기사마다 너무 달라서 안 쓰는게 더 좋을지도
  
 
 저 없을때 ti-cnn csv에서 말고 따로 피쳐 구현한게 있다면 추가해주세요
 
 

---

0909 진욱

* 인풋이 pd.Series로 구성된 기사들의 df 이므로 docs_df 라고 이름 지음. 그에 맞게 수정. 

* tfidf 함수추가

* label 없애고 원래 label 을 그대로 대치시킴


In [0]:
class Feature():
    """Assemble one feature row per article from its text and its saved
    co-occurrence CSV."""

    def __init__(self, dataframe, doc_path_list):
        """
        :param dataframe: DataFrame with a 'text' and a 'type' column
        :param doc_path_list: co-occurrence CSV paths, in the same order as
                              the rows of ``dataframe``
        """
        self.index = dataframe.index
        self.docs_df = dataframe['text']
        self.label = dataframe['type']
        self.doc_filenames = doc_path_list

    def cal_tfidf(self):
        """Return the two per-document arrays produced by CorTfidf.tfidf_doc."""
        model = CorTfidf(self.docs_df)
        tfidf_mean, tfidf_std = model.tfidf_doc()
        return tfidf_mean, tfidf_std

    def cal_edge_weight(self, matrix):
        """Return mean and variance of the 'Weight' column."""
        wt_mean = np.mean(matrix['Weight'])
        wt_var = np.var(matrix['Weight'])
        return wt_mean, wt_var

    def cal_edge_num(self, matrix):
        """Return the number of rows (edges) in the co-occurrence table."""
        return len(matrix['Linkage'])

    def cal_net_feature(self, G):
        """Compute the graph-based features of ``G``.

        :return: (tuple) degree mean, degree variance, mean and variance of
                 the common-neighbour count over edges, number of nodes whose
                 degree exceeds the 0.75 quantile, betweenness group value,
                 density
        """
        bet_val = Measure(G).get_Value()

        # Degree mean / variance from the histogram (index = degree,
        # value = number of nodes with that degree).
        deg_hist = nx.degree_histogram(G)
        node_count = np.sum(np.array(deg_hist))
        deg_mean = 0
        for degree, count in enumerate(deg_hist):
            deg_mean = deg_mean + degree * count
        deg_mean = deg_mean / node_count
        deg_var = 0
        for degree, count in enumerate(deg_hist):
            deg_var = deg_var + ((degree - deg_mean) ** 2) * count
        deg_var = deg_var / node_count

        dens = nx.density(G)

        common_neighbors = [len(list(nx.common_neighbors(G, u, v))) for u, v in G.edges]
        com_mean = np.mean(np.array(common_neighbors))
        com_var = np.var(np.array(common_neighbors))

        degree_sequence = [d for n, d in G.degree()]
        if degree_sequence:
            # The quantile is the same for every node, so compute it once.
            threshold = np.quantile(degree_sequence, 0.75)
            core_count = sum(1 for d in degree_sequence if d > threshold)
        else:
            core_count = 0

        return deg_mean, deg_var, com_mean, com_var, core_count, bet_val, dens

    def make_df(self, doc_path):
        """Return the feature dict of one saved co-occurrence CSV."""
        net = Graph()
        # The CSVs were written to disk earlier, so the string-encoded
        # 'Linkage' column has to be parsed back.
        G, matrix = net.create_graph(doc_path, string_to_list=True)
        wt_mean, wt_var = self.cal_edge_weight(matrix)
        edge_num = self.cal_edge_num(matrix)
        deg_mean, deg_var, com_mean, com_var, core_count, bet_val, density = self.cal_net_feature(G)

        feature_df_one = {'wt_mean': wt_mean,
                          'wt_var': wt_var,
                          'edge_num': edge_num,
                          'degree_mean': deg_mean,
                          'degree_var': deg_var,
                          'com_mean': com_mean,
                          'com_var': com_var,
                          'core_count': core_count,
                          'betweeness': bet_val,
                          'density': density,
                          }

        return feature_df_one

    def make_df_from_dataset(self):
        """Compute every feature for every article.

        :return: (DataFrame) one row per article, indexed like the input
                 frame, with the original 'type' values in a 'label' column
        :raises ValueError: when the number of CSV paths differs from the
                            number of articles
        """
        doc_filenames = list(self.doc_filenames)
        # Fail before the slow tfidf pass: zip() below would otherwise drop
        # the surplus silently and the error would only surface at the end.
        if len(doc_filenames) != len(self.docs_df):
            raise ValueError(
                "number of co-occurrence files (%d) does not match number of documents (%d)"
                % (len(doc_filenames), len(self.docs_df)))

        print("Make dataframe from dataset and cooc path..")
        print("Calculating tfidf for each doc")
        start = time.time()
        tfidf_mean, tfidf_std = self.cal_tfidf()
        end1 = time.time()
        print("Done")
        print("It took %d seconds" % (end1 - start))

        row_list = []
        for doc_path, doc_tf_mean, doc_tf_std in zip(doc_filenames, tfidf_mean, tfidf_std):
            feat_df_one = self.make_df(doc_path)  # feature dict of one document
            feat_df_one.update({'tfidf_mean': doc_tf_mean,
                                'tfidf_std': doc_tf_std})
            row_list.append(feat_df_one)
        end2 = time.time()
        print("All features are calculated")

        print("It took %d seconds" % (end2 - end1))

        feature_df = pd.DataFrame(row_list, columns=row_list[0].keys(), index=self.index)
        feature_df['label'] = self.label
        print("Finished")
        return feature_df

# 생성된 기사들과 원래 기사들의 label 맞추는 작업 필요함 (완료)
 

### os listdir 에서 순서 바꾸기 (완료)

In [0]:
def get_doc_filenames(document_path):
    """
    Return the path of every entry inside ``document_path``.

    Each name reported by os.listdir is joined onto ``document_path``; the
    order is whatever os.listdir returns.
    """
    paths = []
    for entry in os.listdir(document_path):
        paths.append(os.path.join(document_path, entry))
    return paths

In [0]:
# Collect the paths of the saved co-occurrence CSVs (unsorted at this point).
doc_path_list = get_doc_filenames('./cooc/')

In [0]:
# Number of CSV paths found.
len(doc_path_list)

In [0]:
# Inspect the raw listing order (not numeric, hence the sort helper below).
print(doc_path_list)

['./cooc_mini/0.csv', './cooc_mini/11.csv', './cooc_mini/1.csv', './cooc_mini/15.csv', './cooc_mini/13.csv', './cooc_mini/17.csv', './cooc_mini/16.csv', './cooc_mini/12.csv', './cooc_mini/10.csv', './cooc_mini/14.csv', './cooc_mini/18.csv', './cooc_mini/19.csv', './cooc_mini/3.csv', './cooc_mini/7.csv', './cooc_mini/6.csv', './cooc_mini/4.csv', './cooc_mini/2.csv', './cooc_mini/8.csv', './cooc_mini/5.csv', './cooc_mini/9.csv']


In [0]:
import re
def sorted_aphanumeric(data):
    """
    Return ``data`` sorted in natural order, so that '2.csv' comes before
    '10.csv'. Each string is split into digit and non-digit runs; digit runs
    compare as integers, the rest as lower-cased text.
    """
    def natural_key(item):
        parts = []
        for chunk in re.split('([0-9]+)', item):
            parts.append(int(chunk) if chunk.isdigit() else chunk.lower())
        return parts

    return sorted(data, key=natural_key)

In [0]:
# Check that the natural sort puts the files in numeric order.
print(sorted_aphanumeric(doc_path_list))

['./cooc/0.csv', './cooc/1.csv', './cooc/2.csv', './cooc/3.csv', './cooc/4.csv', './cooc/5.csv', './cooc/6.csv', './cooc/7.csv', './cooc/8.csv', './cooc/9.csv', './cooc/10.csv', './cooc/11.csv', './cooc/12.csv', './cooc/13.csv', './cooc/14.csv', './cooc/15.csv', './cooc/16.csv', './cooc/17.csv', './cooc/18.csv', './cooc/19.csv', './cooc/20.csv', './cooc/21.csv', './cooc/22.csv', './cooc/23.csv', './cooc/24.csv', './cooc/25.csv', './cooc/26.csv', './cooc/27.csv', './cooc/28.csv', './cooc/29.csv', './cooc/30.csv', './cooc/31.csv', './cooc/32.csv', './cooc/33.csv', './cooc/34.csv', './cooc/35.csv', './cooc/36.csv', './cooc/37.csv', './cooc/38.csv', './cooc/39.csv', './cooc/40.csv', './cooc/41.csv', './cooc/42.csv', './cooc/43.csv', './cooc/44.csv', './cooc/45.csv', './cooc/46.csv', './cooc/47.csv', './cooc/48.csv', './cooc/49.csv', './cooc/50.csv', './cooc/51.csv', './cooc/52.csv', './cooc/53.csv', './cooc/54.csv', './cooc/55.csv', './cooc/56.csv', './cooc/57.csv', './cooc/58.csv', './coo

# 실제 테스트

In [0]:
# Pair the dataframe rows with the CSV paths in numeric (row) order.
model = Feature(df, sorted_aphanumeric(doc_path_list))

In [0]:
# Compute all features; the tfidf step dominates the run time (see the
# timings printed below).
feat_df = model.make_df_from_dataset()

Make dataframe from dataset and cooc path..
Calculating tfidf for each doc
Done
It took 3870 seconds


Make dataframe from dataset and cooc path..
Calculating tfidf for each doc
Done
It took 2750 seconds
HBox(children=(IntProgress(value=1, bar_style='info', description='Caculating other features', max=1, style=Pr…

Make dataframe from dataset and cooc path..\
Calculating tfidf for each doc\
Done\
It took 4 seconds

All features are calculated\
Finished

In [0]:
# Preview the resulting feature table.
feat_df.head()

In [0]:
# Save the feature table into the current working directory.
feat_df.to_csv('result.csv')

In [0]:
# 0908: started
# 1014: tfidf still has not finished...
1049
# The tfidf computation seems to take roughly 30 minutes

# 할일 - tfidf 가 생각보다 너무 오래걸림. 따로 저장해서 불러오는 식으로 해야할듯

이상한 점 -예전 과 런타임 실행 방식이 너무 다른 것 같음

10시 49분에 학교에서 돌아가는 걸 확인하고 
11시 15분에 집에서 다시 켜보니 런타임이 꺼져있음.

가능한 경우

1. 새로운 네트워크로 접속할 시 무조건 런타임이 재실행된다
2. 25분만에도 런타임이 꺼짐 
3. 내 아이디가 g.skku.edu 아이디라서 그럼

4. 현재 저 코드가 애초에 문제가있어서 저정도에서 돌리다보면 그냥 끝남