In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import json
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/friends3/friends_train3.json
/kaggle/input/friends3/friends_test3.json
/kaggle/input/friends3/friends_dev3.json
/kaggle/input/dfc615e/en_data.csv
/kaggle/input/dfc615e/en_sample.csv


In [2]:
data = pd.read_csv("/kaggle/input/dfc615e/en_data.csv", sep=",", encoding="ms949")
sample = pd.read_csv("/kaggle/input/dfc615e/en_sample.csv", sep=",", encoding="ms949")
dev_file = open('/kaggle/input/friends3/friends_test3.json', encoding="utf-8")
dev_data = json.load(dev_file)
train_file = open('/kaggle/input/friends3/friends_train3.json', encoding="utf-8")
train_data = json.load(train_file)
test_file = open('/kaggle/input/friends3/friends_dev3.json', encoding="utf-8")
test_data = json.load(test_file)

In [3]:
data.head()

Unnamed: 0,id,i_dialog,i_utterance,speaker,utterance
0,0,0,0,Phoebe,"Alright, whadyou do with him?"
1,1,0,1,Monica,Oh! You're awake!
2,2,0,2,Joey,Then you gotta come clean with Ma! This is not...
3,3,0,3,Mr. Tribbiani,"Yeah, but this is"
4,4,0,4,Joey,I don't wanna hear it! Now go to my room!


In [4]:
df_dev = pd.DataFrame(columns=['annotation', 'emotion', 'speaker', 'utterance'])
df_train = pd.DataFrame(columns=['annotation', 'emotion', 'speaker', 'utterance'])
df_test = pd.DataFrame(columns=['annotation', 'emotion', 'speaker', 'utterance'])

for i in range(len(dev_data)):
    df_dev = pd.concat([df_dev, pd.DataFrame(dev_data[i])])

for i in range(len(train_data)):
    df_train = pd.concat([df_train, pd.DataFrame(train_data[i])])

for i in range(len(test_data)):
    df_test = pd.concat([df_test, pd.DataFrame(test_data[i])])

df_dev = df_dev.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_data = data.sort_values("id").reset_index(drop=True)

In [5]:
len(df_dev), len(df_train), len(df_test), len(df_data)

(2764, 10561, 1178, 3296)

In [6]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# 랭커스터 스태머의 사용
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

# Lemmatization 음소표기법
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [7]:
stops = set(stopwords.words('english'))

In [8]:
df_dev['words'] = ''
df_train['words'] = ''
df_test['words'] = ''
df_data['words'] = ''

len(df_dev), len(df_train), len(df_test), len(df_data)

(2764, 10561, 1178, 3296)

In [9]:
def comment_to_words(data):
    # 1. 영어가 아닌 문자는 공백으로 변환
    data = re.sub('[^a-zA-Z]', ' ', data)
    
    # 2. 소문자로 변환
    lowerdata = data.lower()
    
    # 3. 문자열로 변환
    words = lowerdata.split()
    
    # 4. 불용어 제거
    words = [w for w in words if not w in stops]
    
    # 5. 어간추출
    stemming_words = [stemmer.stem(w) for w in words]
    
    # 7. 공백으로 구분된 문자열로 결합하여 결과를 반환
    words = ' '.join(stemming_words)
    return words

In [10]:
# 개발 데이터셋 전처리
for i in range(0, len(df_dev)):
    df_dev.loc[i, 'words'] = comment_to_words(df_dev.loc[i, 'utterance'])

# 훈련 데이터셋 전처리
for i in range(0, len(df_train)):
    df_train.loc[i, 'words'] = comment_to_words(df_train.loc[i, 'utterance'])    

# 테스트 데이터셋 전처리
for i in range(0, len(df_test)):
    df_test.loc[i, 'words'] = comment_to_words(df_test.loc[i, 'utterance'])    
    
# 테스트 데이터셋 전처리
for i in range(0, len(df_test)):
    df_data.loc[i, 'words'] = comment_to_words(df_data.loc[i, 'utterance'])    

In [11]:
y_info = [['neutral', 0],
          ['surprise', 1],
          ['non-neutral', 2],
          ['joy', 3],
          ['sadness', 4],
          ['anger', 5],
          ['disgust', 6]]

y_info = [['neutral', 'neutral'],
          ['surprise', 'surprise'],
          ['non-neutral', 'non-neutral'],
          ['joy', 'joy'],
          ['sadness', 'sadness'],
          ['anger', 'anger'],
          ['disgust', 'disgust']]


In [12]:
df_y = pd.DataFrame(y_info, columns=['emotion', 'Y'])

In [13]:
df_dev = pd.merge(df_dev, df_y, on=['emotion'])
df_train = pd.merge(df_train, df_y, on=['emotion'])
df_test = pd.merge(df_test, df_y, on=['emotion'])

In [14]:
df_train.head()

Unnamed: 0,annotation,emotion,speaker,utterance,words,Y
0,4100000,neutral,Chandler,also I was the point person on my companys tr...,also point person compani transit kl gr system,neutral
1,5000000,neutral,The Interviewer,You mustve had your hands full.,must hand full,neutral
2,5000000,neutral,Chandler,That I did. That I did.,,neutral
3,5000000,neutral,The Interviewer,So lets talk a little bit about your duties.,let talk littl bit duti,neutral
4,5000000,neutral,The Interviewer,"Now youll be heading a whole division, so you...",head whole divis lot duti,neutral


In [15]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras import backend as K
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

import nltk
nltk.download('stopwords')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection  import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline



In [17]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression())
])

parameters = {
    'vect__max_df': (0.25, 0.5),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'clf__C': (0.1, 1, 10),
}

In [18]:
len(df_train['words']), len(df_train['Y']), len(df_test['words']), len(df_test['Y']), 

(10376, 10376, 1149, 1149)

In [19]:
#X_train,  _, y_train, _ = train_test_split(df_train['words'], df_train['Y'], train_size=0.9999)
X_train,  _, y_train, _ = train_test_split(df_train['utterance'], df_train['Y'], train_size=0.9999)


In [20]:
X_data = df_data['words']
X_train[:5]

6566                                                Okay!
6186    Chandler, what kind of an idiot do you take me...
8602                                                 Yay!
9571                            You missed youre chance!
4446                                                 Hey.
Name: utterance, dtype: object

In [21]:
X_test, _, y_test, _ = train_test_split(df_test['utterance'], df_test['Y'], train_size=0.9999)
len(X_train), len(y_train), len(X_test), len(y_test)

(10374, 10374, 1148, 1148)

In [22]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)
print('best score: %0.3f' % grid_search.best_score_)
print('best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   22.1s
[Parallel(n_jobs=4)]: Done 120 out of 120 | elapsed:  1.6min finished


best score: 0.502
best parameters set:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [23]:
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t %s: %r' % (param_name, best_parameters[param_name]))

	 clf__C: 1
	 vect__max_df: 0.25
	 vect__ngram_range: (1, 1)
	 vect__use_idf: False


In [24]:
predictions = grid_search.predict(X_test)
print('Accuracy:', accuracy_score(y_test, predictions))
print('Confusion Matrix:', confusion_matrix(y_test, predictions))
print('*************************************************************')
print('Classification Report:')
print(classification_report(y_test, predictions,target_names=df_y['emotion']))

Accuracy: 0.460801393728223
Confusion Matrix: [[  3   0   5  65  11   0   1]
 [  0   0   0  17   5   0   1]
 [  0   0  24  88   3   0   8]
 [  0   0  15 434  29   0  13]
 [  1   0   3 159  24   2  25]
 [  0   0   0  44   8   6   4]
 [  0   0   3  92  16   1  38]]
*************************************************************
Classification Report:
              precision    recall  f1-score   support

     neutral       0.75      0.04      0.07        85
    surprise       0.00      0.00      0.00        23
 non-neutral       0.48      0.20      0.28       123
         joy       0.48      0.88      0.62       491
     sadness       0.25      0.11      0.15       214
       anger       0.67      0.10      0.17        62
     disgust       0.42      0.25      0.32       150

    accuracy                           0.46      1148
   macro avg       0.44      0.23      0.23      1148
weighted avg       0.45      0.46      0.38      1148



  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
results = grid_search.predict(X_data)

In [26]:
sample['Predicted'] = results

In [27]:
len(sample), len(results)

(3296, 3296)

In [28]:
sample['Predicted'].value_counts()

neutral        3108
non-neutral      68
surprise         66
joy              47
anger             4
disgust           2
sadness           1
Name: Predicted, dtype: int64

In [29]:
sample[['Id', 'Predicted']].to_csv("/kaggle/working/output2_logisticRegrsion.csv", sep=",", encoding="ms949", index=False)

In [None]:
완료!!!

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

features = tfidf.fit_transform(df_train['utterance']).toarray()
labels = df_train['Y']
features.shape

(10376, 1316)

In [31]:
df_train.head()

Unnamed: 0,annotation,emotion,speaker,utterance,words,Y
0,4100000,neutral,Chandler,also I was the point person on my companys tr...,also point person compani transit kl gr system,neutral
1,5000000,neutral,The Interviewer,You mustve had your hands full.,must hand full,neutral
2,5000000,neutral,Chandler,That I did. That I did.,,neutral
3,5000000,neutral,The Interviewer,So lets talk a little bit about your duties.,let talk littl bit duti,neutral
4,5000000,neutral,The Interviewer,"Now youll be heading a whole division, so you...",head whole divis lot duti,neutral


In [32]:
from io import StringIO

df_train['words'] = df_train[pd.notnull(df_train['utterance'])]
df_train['Y2'] = df_train['Y'].factorize()[0]
category_to_id = dict(df_train[['words', 'Y2']].values)
id_to_category = dict(df_train[[ 'Y2', 'words']].values)

In [33]:
from sklearn.feature_selection import chi2
import numpy as np

N = 2

for Product, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# '0000005':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0000014':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0000023':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0000032':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0000041':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0000050':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0000104':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0000113':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0000122':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half ho

# '0020003':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0020012':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0020021':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0020030':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0020102':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0020111':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0020120':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0020201':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0020210':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half ho

# '0200012':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0200021':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0200030':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0200102':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0200111':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0200120':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0200201':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0200210':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '0200300':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half ho

# '1010102':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1010111':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1010120':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1010201':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1010210':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1010300':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1011002':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1011011':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1011020':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half ho

# '1300010':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1300100':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1301000':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1310000':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '1400000':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '2000003':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '2000012':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '2000021':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '2000030':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half ho

# '3100001':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '3100010':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '3100100':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '3101000':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '3110000':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '3200000':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '4000001':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '4000010':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half hour
. happy place
# '4000100':
  . Most correlated unigrams:
. half
. young
  . Most correlated bigrams:
. half ho

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.Consumer_complaint_narrative).toarray()
labels = df.category_id
features.shape


NameError: name 'df' is not defined