In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [4]:
## Sequential

model_seq = keras.Sequential()
model_seq.add(keras.Input(shape = (10, ), name = 'Inputlayer'))
model_seq.add(keras.layers.Dense(64, 'relu'))
model_seq.add(keras.layers.Dropout(0.2))
model_seq.add(keras.layers.Dense(32,'relu'))
model_seq.add(keras.layers.Dense(1,'sigmoid'))

model_seq.summary()

model_seq = keras.Sequential([
    keras.Input(shape = (10, ), name = 'Inputlayer'),
    keras.layers.Dense(64, 'relu'),
    keras.layers.Dense(1,'sigmoid')
])


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                704       
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,817
Trainable params: 2,817
Non-trainable params: 0
_________________________________________________________________


In [6]:
## Functional API
inputs = keras.Input(shape = (10, ))
hidden1 = keras.layers.Dense(64, 'relu')(inputs)
dropout  = keras.layers.Dropout(0.2)(hidden1)
hidden2 = keras.layers.Dense(10, 'softmax')(dropout)

model_fun = keras.Model(inputs, hidden2, name = 'Functional_api')
model_fun.summary()

Model: "Functional_api"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 10)]              0         
                                                                 
 dense_5 (Dense)             (None, 64)                704       
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 10)                650       
                                                                 
Total params: 1,354
Trainable params: 1,354
Non-trainable params: 0
_________________________________________________________________


In [10]:
## Subclasing API

class Model_sub(keras.Model):
    def __init__(self, hidden, dropout, outputs):
        super(Model_sub, self).__init__()
        self.hidden = keras.layers.Dense(hidden, 'relu')
        self.dropout = keras.layers.Dropout(dropout)
        self.outputs = keras.layers.Dense(outputs, 'softmax')
        
    def call(self, inputs):
        x = self.hidden(inputs)
        x = self.dropout(x)
        x = self.outputs(x)
        return x
        
model_sub = Model_sub(64, 0.2, 10)

In [11]:
model_sub.build(input_shape=(1, 100))
model_sub.summary()

Model: "model_sub_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_13 (Dense)            multiple                  6464      
                                                                 
 dropout_6 (Dropout)         multiple                  0         
                                                                 
 dense_14 (Dense)            multiple                  650       
                                                                 
Total params: 7,114
Trainable params: 7,114
Non-trainable params: 0
_________________________________________________________________


In [13]:
## Sentimental analysis
samples = np.array(['너 오늘 이뻐 보인다',
          '나는 오늘 기분이 더러워',
          '끝내주는데, 좋은 일이 있나봐',
          '나 좋은 일이 생겼어',
          '아 오늘 진짜 짜증나',
          '환상적인데, 정말 좋은거 같아'])
targets = np.array([[1], [0], [1], [1], [0], [1]])

In [18]:
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(samples)
# sequences = tokenizer.
tokenizer.word_counts
tokenizer.word_index

sequences = tokenizer.texts_to_sequences(samples)
input_seq = np.array(sequences)

list

In [20]:
batch_size = 2
epochs = 10
vocab_size = len(tokenizer.word_index) + 1
emb_size = 128
hidden1 = 256
hidden2 = 1

In [21]:
class Sentimental_analy(keras.Model):
    def __init__(self, vocab_size, emb_size, hidden1, hidden2):
        super(Sentimental_analy, self).__init__()
        self.embedding = keras.layers.Embedding(vocab_size, emb_size)
        self.dense = keras.layers.Dense(hidden1, 'relu')
        self.outputs = keras.layers.Dense(hidden2, 'sigmoid')
        
    def call(self, inputs):
        x = self.embedding(inputs)
        x = tf.reduce_mean(x, axis = 1)
        x = self.dense(x)
        x = self.outputs(x)
        return x
    
sentimental_analy = Sentimental_analy(vocab_size, emb_size, hidden1, hidden2)

In [None]:
sentimental_analy.compile()
sentimental_analy.fit()

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [24]:
text_data = ['나는 배가 고프다', 
             '내일 점심 뭐먹지', 
             '내일 공부 해야겠다', 
             '점심 먹고 공부 해야지']

In [31]:
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text_data)
tokenizer.word_index

text_data = ['나는 배가 고프다', 
             '내일 점심 뭐먹지', 
             '내일 공부 해야겠다', 
             '점심 먹고 공부 해야지']

countVectorizer = CountVectorizer()
countVectorizer.fit(text_data)
Count_vector = countVectorizer.transform(text_data)
DTM = Count_vector.toarray()
print(DTM)
print('='*50)
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(text_data)
Tfidf_vector = tfidf_vectorizer.transform(text_data).toarray()
TFIDF_matrix = Tfidf_vector
print(TFIDF_matrix)

[[1 0 1 0 0 0 1 0 0 0]
 [0 0 0 1 0 1 0 1 0 0]
 [0 1 0 1 0 0 0 0 1 0]
 [0 1 0 0 1 0 0 1 0 1]]
[[0.57735027 0.         0.57735027 0.         0.         0.
  0.57735027 0.         0.         0.        ]
 [0.         0.         0.         0.52640543 0.         0.66767854
  0.         0.52640543 0.         0.        ]
 [0.         0.52640543 0.         0.52640543 0.         0.
  0.         0.         0.66767854 0.        ]
 [0.         0.43779123 0.         0.         0.55528266 0.
  0.         0.43779123 0.         0.55528266]]


In [34]:
## English tokenizer
# ! pip install nltk
import nltk

In [36]:
nltk.download('all-corpora')
nltk.download('punkt')

[nltk_data] Downloading collection 'all-corpora'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\ETV\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\ETV\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     C:\Users\ETV\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\ETV\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\ETV\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\ETV\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown_tei.zip.
[nltk_data]  

True

In [46]:
sentence = "Natural language processing (NLP) is a subfield of computer science, \
information engineering, and artificial intelligence concerned \
with the interactions between computers and human (natural) languages, \
in particular how to program computers to process and analyze \
large amounts of natural language data."

paragraph = "Natural language processing (NLP) is a subfield of computer science, \
    information engineering, and artificial intelligence concerned with the \
    interactions between computers and human (natural) languages, \
    in particular how to program computers to process and analyze \
    large amounts of natural language data. \
    Challenges in natural language processing frequently involve speech recognition, \
    natural language understanding, and natural language generation."

print(sentence)

Natural language processing (NLP) is a subfield of computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.


In [38]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [48]:
nltk_word = word_tokenize(sentence)
print(nltk_word)
# print(sentence.split())
nltk_sent = sent_tokenize(paragraph)
display(nltk_sent)
len(nltk_sent)

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'computer', 'science', ',', 'information', 'engineering', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data', '.']


['Natural language processing (NLP) is a subfield of computer science,     information engineering, and artificial intelligence concerned with the     interactions between computers and human (natural) languages,     in particular how to program computers to process and analyze     large amounts of natural language data.',
 'Challenges in natural language processing frequently involve speech recognition,     natural language understanding, and natural language generation.']

2

In [2]:
! java -version

java version "1.8.0_381"
Java(TM) SE Runtime Environment (build 1.8.0_381-b09)
Java HotSpot(TM) 64-Bit Server VM (build 25.381-b09, mixed mode)


In [3]:
import sys
sys.version

'3.8.16 (default, Jun 12 2023, 21:00:42) [MSC v.1916 64 bit (AMD64)]'

In [4]:
! pip install ./JPype1-1.4.0-cp38-cp38-win_amd64.whl

Processing c:\users\etv\nbkim\onedrive\documents\lecture_2019\academy\6_nlp\tensorflow-ml-nlp-tf2-master\2.nlp_prep\jpype1-1.4.0-cp38-cp38-win_amd64.whl
Installing collected packages: JPype1
Successfully installed JPype1-1.4.0


In [5]:
! pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
                                              0.0/19.4 MB ? eta -:--:--
                                              0.0/19.4 MB ? eta -:--:--
                                              0.0/19.4 MB ? eta -:--:--
                                             0.0/19.4 MB 219.4 kB/s eta 0:01:29
                                             0.1/19.4 MB 438.1 kB/s eta 0:00:45
     -                                        0.9/19.4 MB 3.9 MB/s eta 0:00:05
     ------------                             6.2/19.4 MB 22.0 MB/s eta 0:00:01
     ----------------------                 11.5/19.4 MB 108.8 MB/s eta 0:00:01
     ----------------------------            14.0/19.4 MB 93.0 MB/s eta 0:00:01
     --------------------------------------  19.3/19.4 MB 93.9 MB/s eta 0:00:01
     --------------------------------------  19.4/19.4 MB 93.9 MB/s eta 0:00:01
     --------------------------------------- 19.4/19.4 MB 50.4 MB/s e

In [7]:
from konlpy.tag import Okt

In [8]:
okt = Okt()

In [18]:
text = "한글 자연어 처리는 재밌다 이제부터 열심히 해야ㅎㅎㅎ"
print(text.split())
print(okt.nouns(text))
print(okt.morphs(text))
print(okt.morphs(text, stem = True))
print(okt.phrases(text))
print(okt.pos(text))

['한글', '자연어', '처리는', '재밌다', '이제부터', '열심히', '해야ㅎㅎㅎ']
['한글', '자연어', '처리', '이제']
['한글', '자연어', '처리', '는', '재밌다', '이제', '부터', '열심히', '해야', 'ㅎㅎㅎ']
['한글', '자연어', '처리', '는', '재밌다', '이제', '부터', '열심히', '하다', 'ㅎㅎㅎ']
['한글', '한글 자연어', '한글 자연어 처리', '이제', '자연어', '처리']
[('한글', 'Noun'), ('자연어', 'Noun'), ('처리', 'Noun'), ('는', 'Josa'), ('재밌다', 'Adjective'), ('이제', 'Noun'), ('부터', 'Josa'), ('열심히', 'Adverb'), ('해야', 'Verb'), ('ㅎㅎㅎ', 'KoreanParticle')]


In [19]:
from konlpy.corpus import kolaw, kobill

In [24]:
# print(kolaw.open('constitution.txt').read())
# 1809890.txt - 1809899.txt
print(kobill.open('1809890.txt').read())

지방공무원법 일부개정법률안

(정의화의원 대표발의 )

 의 안
 번 호

9890

발의연월일 : 2010.  11.  12.  

발  의  자 : 정의화․이명수․김을동 

이사철․여상규․안규백

황영철․박영아․김정훈

김학송 의원(10인)

제안이유 및 주요내용

  초등학교 저학년의 경우에도 부모의 따뜻한 사랑과 보살핌이 필요

한 나이이나, 현재 공무원이 자녀를 양육하기 위하여 육아휴직을 할 

수 있는 자녀의 나이는 만 6세 이하로 되어 있어 초등학교 저학년인 

자녀를 돌보기 위해서는 해당 부모님은 일자리를 그만 두어야 하고 

이는 곧 출산의욕을 저하시키는 문제로 이어질 수 있을 것임.

  따라서 육아휴직이 가능한 자녀의 연령을 만 8세 이하로 개정하려

는 것임(안 제63조제2항제4호).

- 1 -

법률  제        호

지방공무원법 일부개정법률안

지방공무원법 일부를 다음과 같이 개정한다.

제63조제2항제4호 중 “만 6세 이하의 초등학교 취학 전 자녀를”을 “만 

8세 이하(취학 중인 경우에는 초등학교 2학년 이하를 말한다)의 자녀를”

로 한다.

부      칙

이 법은 공포한 날부터 시행한다.

- 3 -

신 ·구조문대비표

현      행

개   정   안

제63조(휴직) ① (생  략)

제63조(휴직) ① (현행과 같음)

  ② 공무원이 다음 각 호의 어

  ② -------------------------

느 하나에 해당하는 사유로 휴

----------------------------

직을 원하면 임용권자는 휴직

----------------------------

을 명할 수 있다. 다만, 제4호

-------------.---------------

의 경우에는 대통령령으로 정

----------------------------

하는 특별한 사정이 없으면 휴

----------------------------

직을 명하여야 한다.

--------------.

  1. ∼ 3.

In [30]:
## python string 함수
a = ' Natural language  '
# print(a)
# a
a.count('a')
a.find('a')
a.find('al')
a.find('w')

-1

In [35]:
print(','.join(a))
print(','.join('kkkk'))
print(','.join(['kkk', 'llll']))
print(' '.join(['kkk', 'llll'])) #

 ,N,a,t,u,r,a,l, ,l,a,n,g,u,a,g,e, , 
k,k,k,k
kkk,llll
kkk llll


In [37]:
a.upper()
a.lower() #

' natural language  '

In [41]:
a.strip() #
a.lstrip()
a.rstrip()

' Natural language'

In [50]:
## 
print(a)
a.replace('^', ' ') #
a.split() #


 Natural language  


['Natural', 'language']

In [57]:
print('I eat %s apples a day' % 3)
numbers = 3
print('I eat {} apples a day'.format(3))
print(f'I eat {numbers} apples a day')


I eat 3 apples a day
I eat 3 apples a day
I eat 3 apples a day


In [58]:
import re

In [61]:
## 메타문자, '.' 줄바꿈 문자인 \n을 제외한 모든 문자와 매치됨을 의미한다.
print(re.search('ab', 'aababc'))
print(re.search('a.b', 'aababc'))
print(re.search('a.b', 'axbabc'))


<re.Match object; span=(1, 3), match='ab'>
<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(0, 3), match='axb'>


In [64]:
ind = re.search('a.b', 'axbawbc')
print(ind)
print(ind.start())
print(ind.end())


<re.Match object; span=(0, 3), match='axb'>
0
3


In [67]:
# [.]
print(re.search('a[.]b', 'aabccab'))
print(re.search('a[.]b', 'aabcca.b'))


None
<re.Match object; span=(5, 8), match='a.b'>


In [71]:
# '*' , 반복을 의미하는 * 메타 문자, 앞에 있는 문자 a가 0부터 무한대로 반복
print(re.search('a*b', 'aababc'))
print(re.search('a*b', 'aaaaababc'))
print(re.search('a*b', 'babc'))
print(re.search('a*b', 'ababc'))

<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(0, 6), match='aaaaab'>
<re.Match object; span=(0, 1), match='b'>
<re.Match object; span=(0, 2), match='ab'>


In [75]:
# '+', 반복을 의미하는 + 메타 문자, 앞에 있는 문자 a가 1부터 무한대로 반복
print(re.search('a+b', 'aababc'))
print(re.search('a+b', 'aaaaababc'))
print(re.search('a+b', 'baaaaaaaaaaaaaaaaaabc'))


<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(0, 6), match='aaaaab'>
<re.Match object; span=(1, 20), match='aaaaaaaaaaaaaaaaaab'>


In [76]:
# '?' , 반복을 의미하는 ? 메타 문자, 앞에 있는 문자 a가 0 또는 1부터 반복

print(re.search('a?b', 'aababc'))
print(re.search('a?b', 'aaaaababc'))
print(re.search('a?b', 'babc'))
print(re.search('a?b', 'ababc'))

<re.Match object; span=(1, 3), match='ab'>
<re.Match object; span=(4, 6), match='ab'>
<re.Match object; span=(0, 1), match='b'>
<re.Match object; span=(0, 2), match='ab'>


In [87]:
# 반복 (a{3,5}b, ?), a{4}b

# print(re.search('a{2}b', 'aababc'))
# print(re.search('a{2}b', 'abaabc'))
# print(re.search('a{2}b', 'ababc'))

print(re.search('a{3,5}b', 'aababc'))
print(re.search('a{3,5}b', 'abaabc'))
print(re.search('a{3,5}b', 'ababc'))
print(re.search('a{3,5}b', 'abaaaaabc'))

None
None
None
<re.Match object; span=(2, 8), match='aaaaab'>


In [96]:
# [abc], search
p = re.compile('[a-z]+')
print(p.search(' python'))
print(p.match('python'))
print(p.match('python').start())
print(p.match('python').end())
print(p.match('python').group())



<re.Match object; span=(1, 7), match='python'>
<re.Match object; span=(0, 6), match='python'>
0
6
python


In [99]:
p = re.compile('sk*t')
m = p.match('string goes here')

if m:
    print('Matched here', m.group())
else:
    print('No matched here')

Matched here st


In [100]:
p = re.compile('[a-z]+')
print(p.match('4 python'))
print(p.search('4 python hello pandas'))

None
<re.Match object; span=(2, 8), match='python'>


In [105]:
## findall
p = re.compile('[a-z]+')
# p.findall('You look only once YOLO')
print(p.findall('life is too short'))
print('life is too short'.split())

['life', 'is', 'too', 'short']
['life', 'is', 'too', 'short']


In [107]:
result = p.finditer('life is too short')

for i in result:
    print(i)

<re.Match object; span=(0, 4), match='life'>
<re.Match object; span=(5, 7), match='is'>
<re.Match object; span=(8, 11), match='too'>
<re.Match object; span=(12, 17), match='short'>


In [111]:
p = re.compile('[a-z]+', re.I)
# p = re.compile('[a-zA-Z]+')

print(p.search('python'))
print(p.search('Python'))


<re.Match object; span=(0, 6), match='python'>
<re.Match object; span=(0, 6), match='Python'>


In [113]:
##
p = re.compile('^python\s\w+', re.MULTILINE)
data = '''python one
life is too short
python two
you need python
python three
'''
p.findall(data)

['python one', 'python two', 'python three']

In [118]:
p = re.compile('python|Hello')
p.match('python and hello')
print(p.search('Hello and python'))
print(p.search('python and Hello '))

<re.Match object; span=(0, 5), match='Hello'>
<re.Match object; span=(0, 6), match='python'>


In [121]:
print(re.search('Life', 'Life is too short'))
print(re.search('Life', 'Short is my Life'))


<re.Match object; span=(0, 4), match='Life'>
<re.Match object; span=(12, 16), match='Life'>


In [123]:
## ^ 문자의 시작, $ 문장의 끝
print(re.search('Life', 'Life is too short'))
print(re.search('Life', 'Too short is Life'))

print(re.search('^Life', 'Life is too short'))
print(re.search('^Life', 'Too short is Life'))

<re.Match object; span=(0, 4), match='Life'>
<re.Match object; span=(13, 17), match='Life'>
<re.Match object; span=(0, 4), match='Life'>
None


In [124]:
print(re.search('Life', 'Life is too short'))
print(re.search('Life', 'Too short is Life'))

print(re.search('Life$', 'Life is too short'))
print(re.search('Life$', 'Too short is Life'))

<re.Match object; span=(0, 4), match='Life'>
<re.Match object; span=(13, 17), match='Life'>
None
<re.Match object; span=(13, 17), match='Life'>


In [126]:
#  *, ?, +
p = re.compile('(ABC)+')
p.search('ABCABCABC OK?')

<re.Match object; span=(0, 9), match='ABCABCABC'>

In [137]:
text = "park 010-1234-1234"
text1 = "park 01012341234"

p = re.compile('\d{3}[-]?\d{4}[-]?\d{4}')
print(p.search(text))
print(p.search(text1))


<re.Match object; span=(5, 18), match='010-1234-1234'>
<re.Match object; span=(5, 16), match='01012341234'>


In [None]:
text = "park 010-1234-1234"
text1 = "park 01012341234"

p = re.compile('\d+[-]\d+[-]\d')


In [138]:
import kaggle

In [139]:
! kaggle competitions list

ref                                                                                           deadline             category            reward  teamCount  userHasEntered  
--------------------------------------------------------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
https://www.kaggle.com/competitions/asl-fingerspelling                                        2023-08-24 23:59:00  Research          $200,000        805           False  
https://www.kaggle.com/competitions/icr-identify-age-related-conditions                       2023-08-10 23:59:00  Featured           $60,000       5870           False  
https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries                      2023-10-11 23:59:00  Featured           $60,000        284           False  
https://www.kaggle.com/competitions/bengaliai-speech                                          2023-10-17 23:59:00  Research           $53,000    

In [140]:
!kaggle competitions download -c word2vec-nlp-tutorial

word2vec-nlp-tutorial.zip: Skipping, found more recently modified local copy (use --force to force download)
