In [1]:
import sklearn
import numpy as np


# Import all of the scikit learn stuff
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd
import warnings
# Suppress warnings from pandas library
warnings.filterwarnings("ignore", category=DeprecationWarning,
module="pandas", lineno=570)

# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import Text
from nltk import FreqDist
import os

## Data Preprocessing

def preprocessing(text):
    """Tokenize, clean and lemmatize raw text.

    Steps, in order:
      1. Split ``text`` into sentences, then each sentence into word tokens.
      2. Keep tokens that are at least three characters long and not purely
         digits, lower-casing the ones that survive.
      3. Drop English stopwords plus a few contraction / ellipsis fragments.
      4. Lemmatize each token with the lemmatizer's default part of speech,
         then lemmatize the result again as a verb (``'v'``).

    Args:
        text: Raw document text.

    Returns:
        list[str]: The processed tokens, in document order.
    """
    # 1. Tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]

    # 2. Cleaning: drop tokens shorter than three characters and pure
    #    numbers, then lower-case. The checks run on the original token.
    tokens = [word.lower() for word in tokens
              if len(word) >= 3 and not word.isdigit()]

    # 3. Remove stopwords. A set makes each membership test constant-time;
    #    the previous list was scanned once per token, which is slow on
    #    whole movie scripts.
    stop = set(stopwords.words('english'))
    stop.update(["n't", "'re", "i'm", "'ve", '...'])
    tokens = [token for token in tokens if token not in stop]

    # 4. Lemmatization: default-POS pass first, verb pass on its result.
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(lmtzr.lemmatize(word), 'v') for word in tokens]


## POS tagging and extraction

def extractNouns(tokens):
    """Return the tokens whose part-of-speech tag is ``NN`` or ``NNP``.

    Args:
        tokens: Sequence of word tokens to tag.

    Returns:
        list[str]: The matching tokens, in input order.
    """
    # Go through the ``nltk`` module: the notebook imports ``nltk`` itself,
    # but no bare ``pos_tag`` name, so the unqualified call raised NameError.
    tagged_list = nltk.pos_tag(tokens)
    return [word for word, tag in tagged_list if tag in ("NN", "NNP")]

def extractVerbs(tokens):
    """Return the tokens whose part-of-speech tag is ``VB``.

    Args:
        tokens: Sequence of word tokens to tag.

    Returns:
        list[str]: The matching tokens, in input order.
    """
    # Go through the ``nltk`` module: the notebook imports ``nltk`` itself,
    # but no bare ``pos_tag`` name, so the unqualified call raised NameError.
    tagged_list = nltk.pos_tag(tokens)
    # Return the verbs built here; the previous code returned ``nouns_list``,
    # a name that does not exist inside this function.
    verbs_list = [word for word, tag in tagged_list if tag == "VB"]
    return verbs_list


#nltk.download()

## Indexing

## Word indexing & preprocessing for every genre directory under the data dir


def indexingScripts(path_genre='./data/'):
    """Build a top-100 word-frequency index for every genre directory.

    Each sub-directory of ``path_genre`` is treated as one genre. All of its
    ``.txt`` files are read as UTF-8, combined, passed through
    ``preprocessing`` and counted with ``FreqDist``. Progress is printed
    while files are read and after each genre is processed.

    Args:
        path_genre: Directory that holds one sub-directory per genre.
            Defaults to ``'./data/'``, the location that used to be
            hard-coded.

    Returns:
        list: One ``[genre_name, fdist.most_common(100)]`` entry per genre,
        in the order ``os.listdir`` reports the genre directories.
    """
    total_freq = []
    for genre in os.listdir(path_genre):
        path_file = os.path.join(path_genre, genre)
        # Stray files next to the genre folders (for example ``.DS_Store``)
        # would make the inner ``os.listdir`` raise NotADirectoryError.
        if not os.path.isdir(path_file):
            continue

        scripts = []
        for file_name in os.listdir(path_file):
            if not file_name.endswith(".txt"):
                continue
            print("Indexing... ", file_name)
            # ``with`` closes every handle; the previous code left them open.
            with open(os.path.join(path_file, file_name), 'rt',
                      encoding='utf-8') as script:
                scripts.append(script.read())

        print("processing... %s" % genre)
        # Join with a newline so the last word of one script cannot fuse
        # with the first word of the next one.
        preprocessed = preprocessing("\n".join(scripts))
        fdist = FreqDist(preprocessed)
        freq = [genre, fdist.most_common(100)]
        total_freq.append(freq)
        print(freq, '\n')

    return total_freq

def indexingTestFiles(path_input):
    """Build a top-100 word-frequency index for each ``.txt`` file in a folder.

    Args:
        path_input: Directory containing the text files to index. Entries
            that do not end in ``.txt`` are ignored.

    Returns:
        list: One ``[file_stem, fdist.most_common(100)]`` entry per file,
        where ``file_stem`` is the file name without its ``.txt`` suffix.
        Entries follow the order reported by ``os.listdir``.
    """
    ret_list = []
    for file_name in os.listdir(path_input):
        if not file_name.endswith(".txt"):
            continue

        # ``with`` closes the handle; the previous code never closed it.
        with open(os.path.join(path_input, file_name), 'rt',
                  encoding='utf-8') as script:
            text = script.read()

        fdist = FreqDist(preprocessing(text))
        # Keep the 100 most frequent words; this cutoff is worth experimenting with.
        ret_list.append([file_name[:-4], fdist.most_common(100)])
    return ret_list



def queryProcessing(text, name):
    """Pair ``name`` with the 100 most common preprocessed tokens of ``text``.

    Args:
        text: Raw query text; it is passed through ``preprocessing``.
        name: Label stored as the first element of the result.

    Returns:
        list: ``[name, most_common]`` where ``most_common`` is
        ``FreqDist(...).most_common(100)`` over the preprocessed tokens.
    """
    token_counts = FreqDist(preprocessing(text))
    return [name, token_counts.most_common(100)]



In [2]:
# One independent, initially empty list of script file paths per genre.
(actions, Adventures, Animations, Comedys, Crimes, Dramas, Familys,
 Fantasys, Horrors, Mysterys, Romances, Sci_Fis, Thrillers) = (
    [] for _ in range(13)
)

# All per-genre path lists, in a fixed genre order.
movies = [
    actions, Adventures, Animations, Comedys, Crimes, Dramas, Familys,
    Fantasys, Horrors, Mysterys, Romances, Sci_Fis, Thrillers,
]

# One independent, initially empty list of script contents per genre.
(action_content, Adventure_content, Animation_content, Comedy_content,
 Crime_content, Drama_content, Family_content, Fantasy_content,
 Horror_content, Mystery_content, Romance_content, Sci_Fi_content,
 Thriller_content) = ([] for _ in range(13))

# All per-genre content lists, in a fixed genre order.
movies_content = [
    action_content, Adventure_content, Animation_content, Comedy_content,
    Crime_content, Drama_content, Family_content, Fantasy_content,
    Horror_content, Mystery_content, Romance_content, Sci_Fi_content,
    Thriller_content,
]



In [3]:
from __future__ import print_function

# Collect every genre's script paths (files ending in ".txt", searched
# recursively with os.walk) into the matching per-genre list.
_DATA_ROOT = '/Users/simdaebeom/Desktop/nlp_t2m-master/data'
_GENRE_SOURCES = [
    (actions, _DATA_ROOT + '/action'),
    (Adventures, _DATA_ROOT + '/Adventure'),
    (Animations, _DATA_ROOT + '/Animation'),
    (Comedys, _DATA_ROOT + '/Comedy'),
    (Crimes, _DATA_ROOT + '/Crime'),
    (Dramas, _DATA_ROOT + '/Drama'),
    (Familys, _DATA_ROOT + '/Family'),
    # NOTE(review): Fantasy is the only genre read from a different checkout
    # (Documents/GitHub/nlp_t2m). The path is kept exactly as it was; confirm
    # it is intentional, because os.walk yields nothing for a missing
    # directory and the Fantasy list would then stay empty without any error.
    (Fantasys, '/Users/simdaebeom/Documents/GitHub/nlp_t2m/data/Fantasy'),
    (Horrors, _DATA_ROOT + '/Horror'),
    (Mysterys, _DATA_ROOT + '/Mystery'),
    (Romances, _DATA_ROOT + '/Romance'),
    (Sci_Fis, _DATA_ROOT + '/Sci-Fi'),
    (Thrillers, _DATA_ROOT + '/Thriller'),
]

for genre_paths, genre_dir in _GENRE_SOURCES:
    # Slice-assign instead of appending: re-running this cell no longer
    # duplicates entries, and `movies` still refers to the same list objects.
    genre_paths[:] = [
        os.path.join(root, fname)
        for root, dirs, files in os.walk(genre_dir)
        for fname in files
        if fname.endswith(".txt")
    ]

In [4]:
# Read every script, preprocess it, and store it as one space-joined string
# in the content list of its genre.
for genre_paths, genre_docs in zip(movies, movies_content):
    docs = []
    for script_path in genre_paths:
        # Read as UTF-8 explicitly, matching the indexing functions in this
        # notebook, instead of relying on the platform default encoding.
        with open(script_path, 'r', encoding='utf-8') as content_file:
            docs.append(" ".join(preprocessing(content_file.read())))
    # Slice-assign instead of appending so that re-running this cell does not
    # duplicate documents; the per-genre list objects stay the same.
    genre_docs[:] = docs


In [5]:
# Merge the contents of each genre into a single space-separated document.
(action_, Adventure_, Animation_, Comedy_, Crime_, Drama_, Family_,
 Fantasy_, Horror_, Mystery_, Romance_, Sci_Fi_, Thriller_) = (
    " ".join(genre_docs)
    for genre_docs in (
        action_content, Adventure_content, Animation_content, Comedy_content,
        Crime_content, Drama_content, Family_content, Fantasy_content,
        Horror_content, Mystery_content, Romance_content, Sci_Fi_content,
        Thriller_content,
    )
)




In [8]:
def compare_movie(file_name):
    """Print the three genres most similar to one movie script.

    The script at ``file_name`` is preprocessed and added as the last
    document after the 13 per-genre documents. The 14 documents are turned
    into term counts, reduced to two LSA components, normalised with
    ``Normalizer`` and compared to the script by dot product.

    Args:
        file_name: Path of the script text file (read as UTF-8).

    Returns:
        None. The three best-matching genre labels are printed, best match
        first, one per line.
    """
    genre_names = ["action", "Adventure", "Animation", "Comedy", "Crime",
                   "Drama", "Family", "Fantasy", "Horror", "Mystery",
                   "Romance", "Sci_Fi", "Thriller"]
    genre_docs = [action_, Adventure_, Animation_, Comedy_, Crime_, Drama_,
                  Family_, Fantasy_, Horror_, Mystery_, Romance_, Sci_Fi_,
                  Thriller_]

    # Explicit UTF-8, matching the indexing functions in this notebook.
    with open(file_name, 'r', encoding='utf-8') as content_file:
        query_doc = " ".join(preprocessing(content_file.read()))

    # The query script is always the LAST document of the corpus.
    movie_content = genre_docs + [query_doc]

    vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')
    dtm = vectorizer.fit_transform(movie_content)

    # Fit LSA. Use algorithm = "randomized" for large datasets
    lsa = TruncatedSVD(2, algorithm = 'arpack')
    dtm_lsa = lsa.fit_transform(dtm.astype(float))
    dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

    # Similarity of the query (last row) to the genre rows only. Leaving the
    # query out of the candidates means it can never be reported as its own
    # match, whatever its self-similarity rounds to.
    genre_similarity = dtm_lsa[:-1] @ dtm_lsa[-1]

    # Rank by index rather than looking sorted values up again with float
    # equality: with tied scores the lookup returned the same genre more than
    # once. A stable sort breaks ties in genre-list order.
    top_three = np.argsort(-genre_similarity, kind='stable')[:3]

    print("\n".join(genre_names[idx] for idx in top_three))


In [9]:
# Print the three genres ranked closest to the Thor script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Thor.txt")

Adventure
action
Sci_Fi


In [10]:
# Print the three genres ranked closest to the Alien script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Alien.txt")

Sci_Fi
action
Adventure


In [12]:
# Print the three genres ranked closest to the Godfather script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Godfather.txt")

Drama
Comedy
Romance


In [13]:
# Print the three genres ranked closest to the Hackers script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Hackers.txt")

Comedy
Drama
Romance


In [14]:
# Print the three genres ranked closest to the Hostage script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Hostage.txt")

Mystery
Family
Crime


In [15]:
# Print the three genres ranked closest to the Inglourious Basterds script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Inglourious-Basterds.txt")

Drama
Crime
Family


In [16]:
# Print the three genres ranked closest to the Jurassic Park: The Lost World script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Jurassic-Park-The-Lost-World.txt")

Sci_Fi
action
Adventure


In [17]:
# Print the three genres ranked closest to the Lord of the Rings: The Two Towers script.
# NOTE(review): the recorded result (Sci_Fi, action, Adventure) is identical to
# the result recorded for several other scripts in this notebook, and Fantasy
# never appears in any recorded result — worth checking the Fantasy corpus and
# whether two LSA components are enough to separate the genres.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Lord-of-the-Rings-The-Two-Towers.txt")

Sci_Fi
action
Adventure


In [18]:
# Print the three genres ranked closest to the Mad Max 2: The Road Warrior script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Mad-Max-2-The-Road-Warrior.txt")

Sci_Fi
action
Adventure


In [19]:
# Print the three genres ranked closest to the Titanic script.
# NOTE(review): the recorded result (Sci_Fi, action, Adventure) is identical to
# the result recorded for several other scripts in this notebook — confirm
# whether two LSA components are enough to separate the genres.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Titanic.txt")

Sci_Fi
action
Adventure


In [20]:
# Print the three genres ranked closest to the Total Recall script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Total-Recall.txt")

Sci_Fi
action
Adventure


In [21]:
# Print the three genres ranked closest to the War Horse script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/War-Horse.txt")

Animation
Thriller
Mystery


In [22]:
# Print the three genres ranked closest to the Yes Man script.
compare_movie("/Users/simdaebeom/Desktop/nlp_t2m-master/input/Yes-Man.txt")

Romance
Comedy
Drama
