In [432]:
###############################################################################################
# A user arrives with the given parameters.
# Based on these, the group the user belongs to is determined.
# The other users belonging to that group are looked up.
# All interactions produced by the members of the group are looked up.
# From these, the interesting products are collected.
# They are weighted and sorted according to the type of the interactions.
# Finally, X products are recommended based on these scores.
#
# N similar recommendations can also be requested for a given product.
###############################################################################################

# TODO
# - If there are no interactions or users yet, or a user cannot be assigned to any group, we still need to recommend something
# - Factor into the scoring whether the user has not yet interacted with a product; such products should rank higher
# - Add the county to the product attributes as well

from __future__ import unicode_literals
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import string
import re
import nltk
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
from nltk.stem import RegexpStemmer
from nltk.stem.snowball import HungarianStemmer

class Text_Normalizer:
    """Text-cleaning helpers that turn free text into a list of keyword tokens.

    ``normalize`` chains the individual steps: punctuation removal, digit
    removal, lower-casing, tokenizing, stop-word removal and lemmatizing.
    """

    def remove_punct(self, text):
        """Return *text* without the characters listed in ``string.punctuation``."""
        return "".join(char for char in text if char not in string.punctuation)

    def remove_numbers(self, text):
        """Return *text* with the ASCII digits 0-9 removed."""
        digits = set(string.digits)
        return "".join(char for char in text if char not in digits)

    def tokenize(self, text):
        """Split *text* into word tokens.

        ``\\W+`` matches runs of NON-word characters, so the text is split on
        everything that is not a letter, digit or underscore.  ``re.UNICODE``
        keeps accented letters inside a token on interpreters where that is
        not the default.  Empty strings produced by leading/trailing
        separators are dropped so they never end up as keywords.
        """
        return [token for token in re.split(r'\W+', text, flags=re.UNICODE) if token]

    def remove_stopwords(self, tokenized_list):
        """Return the tokens that are not in NLTK's Hungarian stop-word list."""
        # A set gives constant-time membership tests for every token.
        stopwords = set(nltk.corpus.stopwords.words('hungarian'))
        return [word for word in tokenized_list if word not in stopwords]

    def stemming(self, tokenized_text):
        """Return the tokens stemmed with NLTK's Snowball ``HungarianStemmer``."""
        stemmer = HungarianStemmer()
        return [stemmer.stem(word) for word in tokenized_text]

    def lemmatizing(self, tokenized_text):
        """Return the tokens passed through NLTK's ``WordNetLemmatizer``.

        NOTE(review): WordNet is an English resource, so Hungarian tokens
        presumably come back unchanged -- confirm whether ``stemming`` should
        be used here instead.
        """
        wn = nltk.WordNetLemmatizer()
        return [wn.lemmatize(word) for word in tokenized_text]

    def normalize(self, text):
        """Run the full cleaning pipeline on *text* and return the token list."""
        text_clean = self.remove_punct(text)
        text_nonumbers = self.remove_numbers(text_clean)
        text_tokenized = self.tokenize(text_nonumbers.lower())
        text_nostop = self.remove_stopwords(text_tokenized)

        return self.lemmatizing(text_nostop)

def get_data():
    """Load the three input tables from CSV files.

    The files are read with relative paths, i.e. from the current working
    directory.

    Returns:
        A ``(users, items, interactions)`` tuple of pandas DataFrames.
    """
    file_names = ('users.csv', 'items.csv', 'interactions.csv')
    return tuple(pd.read_csv(file_name) for file_name in file_names)

def create_classifier(algo):
    """Return a new, unfitted scikit-learn classifier selected by name.

    Args:
        algo: One of ``'decisiontree'``, ``'svc'`` or ``'knn'``.

    Returns:
        The matching estimator instance, or ``None`` when *algo* is not a
        known name.
    """
    # Each factory imports and builds only its own estimator, so a call
    # constructs exactly the classifier that was asked for instead of all
    # three of them.
    def _decision_tree():
        from sklearn import tree
        return tree.DecisionTreeClassifier()

    def _svc():
        from sklearn.svm import SVC
        return SVC(gamma='auto')

    def _knn():
        from sklearn.neighbors import KNeighborsClassifier
        return KNeighborsClassifier(n_neighbors=3)

    factories = {
        'decisiontree': _decision_tree,
        'svc': _svc,
        'knn': _knn,
    }

    factory = factories.get(algo)
    if factory is None:
        return None
    return factory()

class User_Recommender:
    """Find users similar to a new user and rank the items they interacted with."""

    def get_variables(self, dataset):
        """Return the names of the indicator columns used to encode users.

        These are the distinct, non-missing ``City`` values of *dataset*
        followed by the two ``Sex`` codes ``'F'`` and ``'N'``.
        """
        # Missing cities are skipped so that NaN never becomes a column name.
        variables = list(dataset['City'].dropna().unique())
        variables += ['F', 'N']

        return variables

    def get_normalized_data(self, dataset, variables, remove_label=True):
        """Return a binary-encoded copy of a DataFrame with categorical data.

        For every name in *variables* a 0/1 column is appended; a row gets a
        1 in the column named after its ``City`` value and its ``Sex`` value
        (when that value is listed in *variables*).  The ``City`` and ``Sex``
        columns, ``Id`` (if present) and -- when *remove_label* is true --
        ``Label`` (if present) are removed.  *dataset* itself is not modified.

        Args:
            dataset: DataFrame with at least ``City`` and ``Sex`` columns.
            variables: Names of the indicator columns to create.
            remove_label: Whether to drop the ``Label`` column.

        Returns:
            A new DataFrame: the remaining original columns followed by the
            indicator columns in the order given by *variables*.
        """
        categorical = ['City', 'Sex']

        # Map each variable to its indicator column, ignoring repeated names.
        column_of = {}
        ordered = []
        for name in variables:
            if name not in column_of:
                column_of[name] = len(ordered)
                ordered.append(name)

        flags = np.zeros((len(dataset), len(ordered)), dtype=int)
        for column in categorical:
            for row, value in enumerate(dataset[column]):
                if pd.isnull(value):
                    continue
                if value in column_of:
                    flags[row, column_of[value]] = 1

        removable = list(categorical)
        if remove_label and 'Label' in dataset.columns:
            removable.append('Label')
        if 'Id' in dataset.columns:
            removable.append('Id')
        # An existing column that shares its name with a variable is replaced
        # by the indicator column rather than duplicated.
        removable += [c for c in dataset.columns
                      if c in column_of and c not in removable]

        base = dataset.drop(columns=removable)
        encoded = pd.DataFrame(flags, index=dataset.index, columns=ordered)
        return pd.concat([base, encoded], axis=1)

    def get_similar_users(self, user):
        """Return the known users that fall into the same group as a new user.

        A decision tree is trained on the stored users (features: the encoded
        user data, target: the ``Label`` column) and used to predict the
        group of *user*.

        Args:
            user: ``[City, Age, Sex]`` values of the new user.

        Returns:
            A list of user rows (pandas Series) whose ``Label`` equals the
            predicted group.
        """
        classifier = create_classifier('decisiontree')

        df_users_original, _, _ = get_data()
        # Read the target by name, consistent with the 'Label' handling in
        # the rest of the class.
        y = df_users_original['Label'].values

        variables = self.get_variables(df_users_original)
        df_users = self.get_normalized_data(df_users_original, variables)
        classifier.fit(df_users.values, y)

        df_user = pd.DataFrame([user], columns=['City', 'Age', 'Sex'])
        df_user = self.get_normalized_data(df_user, variables)

        # Predict on a plain array, the same kind of input used for fitting.
        group = classifier.predict(df_user.values)[0]

        same_group = df_users_original[df_users_original['Label'] == group]
        return [row for _, row in same_group.iterrows()]

    def get_item_ids_by_users(self, users):
        """Return item ids ranked by the interactions of the given users.

        Args:
            users: Iterable of user records that can be indexed with ``'Id'``.

        Returns:
            The distinct ``ItemId`` values found in the interactions of these
            users, best score first (see ``get_score``).  Items with equal
            scores keep the order in which they first appear.
        """
        user_ids = [u['Id'] for u in users]

        _, _, interactions = get_data()

        mask = interactions['UserId'].isin(user_ids)
        interactions_by_users = interactions[mask].to_dict('records')

        # Distinct item ids in first-seen order, so ties sort deterministically.
        item_ids = []
        seen = set()
        for record in interactions_by_users:
            item_id = record['ItemId']
            if item_id not in seen:
                seen.add(item_id)
                item_ids.append(item_id)

        scores = {item_id: self.get_score(item_id, interactions_by_users)
                  for item_id in item_ids}
        item_ids.sort(key=scores.get, reverse=True)

        return item_ids

    def get_score(self, item_id, interactions):
        """Return the interaction score of one item.

        The score is the share of all *interactions* that are views of the
        item plus twice the share that are purchases of it.

        Args:
            item_id: Id of the item to score.
            interactions: Sequence of records indexable with ``'ItemId'`` and
                ``'Action'``; the action must be ``'Buy'`` or ``'View'``.

        Returns:
            The score as a float; ``0.0`` when *interactions* is empty.

        Raises:
            KeyError: If an interaction of the item has another action.
        """
        sum_interactions = len(interactions)
        if sum_interactions == 0:
            return 0.0

        item_info = {'Buy': 0, 'View': 0}
        for inter in interactions:
            if inter['ItemId'] == item_id:
                item_info[inter['Action']] += 1

        # Force float division so the shares are never truncated to 0.
        sum_interactions = float(sum_interactions)
        score = item_info['View'] / sum_interactions
        score += (item_info['Buy'] / sum_interactions) * 2

        return score

class Item_Recommender:
    """Recommend items that are similar to an item or that suit a user."""

    def get_normalized_data(self, df):
        """Return a binary-encoded copy of the item table.

        A 0/1 indicator column is created for every distinct non-missing
        ``City`` value, ``State`` value and keyword.  The ``City``, ``State``,
        ``Keywords``, ``Name`` and ``Description`` columns are removed; all
        other columns (e.g. ``Id``) are kept.  *df* itself is not modified.

        Args:
            df: Item DataFrame whose ``Keywords`` column holds a list of
                tokens per row (see ``add_keywords``).

        Returns:
            A new DataFrame: the kept columns followed by the indicators.
        """
        categorical = ['City', 'State']

        # Collect the indicator names in first-seen order, without repeats.
        column_of = {}
        variables = []

        def register(name):
            if name not in column_of:
                column_of[name] = len(variables)
                variables.append(name)

        for column in categorical:
            for value in df[column].dropna().unique():
                register(value)
        for keywords in df['Keywords']:
            for word in keywords:
                register(word)

        flags = np.zeros((len(df), len(variables)), dtype=int)
        for column in categorical:
            for row, value in enumerate(df[column]):
                if pd.isnull(value):
                    continue
                flags[row, column_of[value]] = 1
        for row, keywords in enumerate(df['Keywords']):
            for word in keywords:
                flags[row, column_of[word]] = 1

        removable = categorical + ['Keywords', 'Name', 'Description']
        # An existing column that shares its name with an indicator is
        # replaced by the indicator rather than duplicated.
        removable += [c for c in df.columns
                      if c in column_of and c not in removable]

        base = df.drop(columns=removable)
        encoded = pd.DataFrame(flags, index=df.index, columns=variables)
        return pd.concat([base, encoded], axis=1)

    def get_similar_item_ids(self, item_id, N=2):
        """Return the ids of the *N* items most similar to *item_id*.

        Candidates are the nearest neighbours of the item in the binary
        feature space (Euclidean distance); they are then re-ranked with
        ``get_score``.

        Args:
            item_id: ``Id`` of the reference item.
            N: Maximum number of ids to return.

        Returns:
            Up to *N* item ids, best score first.  The reference item itself
            is never included.

        Raises:
            ValueError: If no item has the given id.
        """
        _, items, _ = get_data()
        self.add_keywords(items)
        normalized_items = self.get_normalized_data(items)

        matches = np.flatnonzero((items['Id'] == item_id).values)
        if len(matches) == 0:
            raise ValueError("Unknown item id: {!r}".format(item_id))
        position = matches[-1]

        # The identifier is not a property of the item, so it must not
        # contribute to the distance between two items.
        X = normalized_items.drop(columns=['Id']).values

        # Ask for at least five candidates (one of which is normally the item
        # itself), more when N requires it, but never more than there are rows.
        n_candidates = min(len(items), max(N + 1, 5))
        nbrs = NearestNeighbors(n_neighbors=n_candidates, algorithm='auto',
                                metric='euclidean').fit(X)
        distances, indices = nbrs.kneighbors(X[position].reshape(1, -1))

        item = items.iloc[position]
        ids = items['Id'].tolist()

        scored = []
        for distance, index in zip(distances[0], indices[0]):
            # Skip the reference item explicitly; with identical feature
            # vectors it is not guaranteed to be the first neighbour.
            if index == position:
                continue
            candidate = items.iloc[index]
            score = self.get_score(item['Name'], item['City'], item['State'],
                                   candidate['Name'], candidate['City'],
                                   candidate['State'], distance)
            scored.append((score, ids[index]))

        # Stable sort: equally scored candidates stay in distance order.
        scored.sort(key=lambda pair: pair[0], reverse=True)

        return [candidate_id for _, candidate_id in scored[:N]]

    def get_score(self, main_name, main_city, main_state, name, city, state, distance):
        """Return the similarity score of a candidate item.

        The score is ``location_factor * name_factor``, divided by *distance*
        when the distance is non-zero.  The location factor is 3 for the same
        city (otherwise 1) and is multiplied by 1.9 for the same state; the
        name factor is 2 when either fuzzy name ratio exceeds 50 (otherwise 1).

        Args:
            main_name, main_city, main_state: Attributes of the reference item.
            name, city, state: Attributes of the candidate item.
            distance: Feature-space distance between the two items.
        """
        # TODO: in production the real distance / county could be checked here
        if main_city == city:
            location_factor = 3
        else:
            location_factor = 1

        # Open question: which matters more, the content of the item or its
        # location?  E.g. after buying a Szombathely museum / other cultural
        # card, which should score higher: a Budapest museum / cultural card,
        # or a Sárvár spa?
        if main_state == state:
            location_factor *= 1.9

        if fuzz.ratio(main_name, name) > 50 or fuzz.token_set_ratio(main_name, name) > 50:
            name_factor = 2
        else:
            name_factor = 1

        score = location_factor * name_factor
        # NOTE(review): a candidate at distance 0 keeps the undivided score,
        # so it ranks below an otherwise equal candidate at a distance
        # smaller than 1 -- confirm this is intended.
        if distance != 0.0:
            score = score / distance

        return score

    def add_keywords(self, df):
        """Add a ``Keywords`` column to *df* in place.

        The keywords of a row are the normalized tokens of its ``Name`` and
        ``Description`` joined by a space; a missing value is treated as an
        empty string.
        """
        # TODO
        #   - look for a Hungarian NLP package, because many filler words and
        #     inflected word forms still remain

        text_normalizer = Text_Normalizer()
        text = df['Name'].fillna('') + " " + df['Description'].fillna('')
        df['Keywords'] = text.apply(text_normalizer.normalize)

    def recommend_by_item(self, item_id, N=2):
        """Return the rows of up to *N* items similar to the item *item_id*."""
        ids = self.get_similar_item_ids(item_id, N)
        _, items, _ = get_data()

        return self._rows_for_ids(items, ids)

    def recommend_by_user(self, user_recommender, user, N=2):
        """Return the rows of the top *N* items for a new user.

        Args:
            user_recommender: Object providing ``get_similar_users`` and
                ``get_item_ids_by_users``.
            user: The new user's data as expected by ``get_similar_users``.
            N: Number of items to return.
        """
        similar_users = user_recommender.get_similar_users(user)
        item_ids = user_recommender.get_item_ids_by_users(similar_users)

        _, items, _ = get_data()

        return self._rows_for_ids(items, item_ids)[0:N]

    def _rows_for_ids(self, items, ids):
        """Return the rows of *items* whose ``Id`` is in *ids*, in *ids* order."""
        rows = []
        for idx in ids:
            matching = items[items['Id'] == idx]
            rows.extend(row for _, row in matching.iterrows())
        return rows
    
# Demo run.  Guarded so that importing this file as a module does not read the
# CSV files or print anything; when executed as a script or notebook cell
# (``__name__ == "__main__"``) it runs exactly as before and the names below
# are still defined at module level.
if __name__ == "__main__":
    user_recommender = User_Recommender()
    item_recommender = Item_Recommender()

    # New user given as [City, Age, Sex].
    new_user = ['Budapest', 23, 'F']
    items_by_user = item_recommender.recommend_by_user(user_recommender, new_user, N=2)
    #print(items_by_user)

    # Two items similar to the item with Id 5.
    items_by_item = item_recommender.recommend_by_item(5, N=2)
    print(items_by_item)


[Id                                       4
Name           Szombathely karnevál kártya
Description               Savaria karnevál
City                           Szombathely
State                                  Vas
Name: 3, dtype: object, Id                                   6
Name                      Sárvár fürdő
Description    sárvári termál kristály
City                            Sárvár
State                              Vas
Name: 5, dtype: object]


