In [1]:
import pymorphy2
import pandas as pd

# Shared pymorphy2 analyzer instance; lemmatize() in this notebook calls
# morph.parse(...) on it, so it is built once here rather than per call.
morph = pymorphy2.MorphAnalyzer() 

In [2]:
def get_clear_data(name):
    """Load the raw answers CSV and strip it down to the columns used later.

    The file is read without a header row; its seven columns are labelled
    ``id, age, order, big, small, gender, Unnamed``. The first (``id``) and
    the seventh (``Unnamed``) columns are discarded, rows in which both
    ``big`` and ``small`` are missing are removed, and the surviving row
    index is copied into a new ``id_person`` column.

    Args:
        name: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        DataFrame with columns ``age, order, big, small, gender, id_person``.
    """
    column_names = ['id', 'age', 'order', 'big', 'small', 'gender', 'Unnamed']
    raw = pd.read_csv(name, names=column_names)

    # Drop by position, exactly as labelled above: slot 6 and slot 0.
    trimmed = raw.drop(columns=[raw.columns[6], raw.columns[0]])

    # A row carries no answers at all when both answer columns are empty.
    no_answers = trimmed['big'].isnull() & trimmed['small'].isnull()
    answered = trimmed.drop(trimmed[no_answers].index)

    # Keep the original row number as the person identifier.
    answered['id_person'] = answered.index
    return answered


def split_lower(text):
    """Split a comma-separated string into a list of raw items.

    Despite the name, no lower-casing is performed here: the pieces are
    returned exactly as they appear between commas, including any
    surrounding whitespace.

    Args:
        text: The cell value. Strings are split on ``','``; any other value
            (for example a NaN float from an empty CSV cell) is passed
            through untouched.

    Returns:
        A list of substrings for string input, otherwise ``text`` itself.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    if isinstance(text, str):
        return text.split(',')
    return text


def lemmatize(words):
    """Replace every word in ``words`` with its normal form.

    Each word is stripped of surrounding spaces, commas and periods and
    passed to the module-level ``morph`` analyzer; the ``normal_form`` of
    the first parse is used.

    Args:
        words: Iterable of strings. Anything else (for example a NaN float
            standing in for a missing answer) is returned unchanged.

    Returns:
        A list of normal forms, or ``words`` itself when it cannot be
        processed (it is not iterable, or an element is not a string).
    """
    try:
        return [morph.parse(word.strip(' ,.'))[0].normal_form for word in words]
    except (TypeError, AttributeError):
        # TypeError: ``words`` is not iterable (missing value).
        # AttributeError: an element has no ``strip`` (not a string).
        # Only these expected failures fall back to the raw input; a bare
        # ``except:`` would also hide interrupts and genuine analyzer bugs.
        return words


def standartize_gender(gender):
    """Map a free-form gender answer onto ``'ж'``, ``'м'`` or ``'-'``.

    Only the first character of the answer (after trimming surrounding
    whitespace and lower-casing) is inspected: ``'ж'``/``'f'`` map to
    ``'ж'``, ``'м'``/``'m'`` map to ``'м'``.

    Args:
        gender: The raw cell value; expected to be a string, but any value
            is accepted.

    Returns:
        ``'ж'``, ``'м'``, or ``'-'`` when the value is not a string, is
        empty, or starts with any other character.
    """
    # Explicit checks replace the former bare ``except:`` that silently
    # covered both non-string values and empty strings.
    if not isinstance(gender, str):
        return '-'

    # Slicing (rather than indexing) yields '' for empty/blank answers.
    initial = gender.strip().lower()[:1]
    if initial in ('ж', 'f'):
        return 'ж'
    if initial in ('м', 'm'):
        return 'м'
    return '-'
    

def standartize_order(order):
    """Translate the raw ``order`` value into a short code.

    Args:
        order: Raw cell value.

    Returns:
        ``'s-b'`` when ``order`` equals ``'small'``; ``'b-s'`` for every
        other value, including missing ones.
    """
    return 's-b' if order == 'small' else 'b-s'
    

def normalize(df):
    """Normalise the answer, gender and order columns of ``df`` in place.

    * ``big`` and ``small``: each cell is split with ``split_lower`` and the
      result is passed through ``lemmatize``.
    * ``gender``: mapped with ``standartize_gender``.
    * ``order``: mapped with ``standartize_order``.

    Args:
        df: DataFrame containing ``big``, ``small``, ``gender`` and
            ``order`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Bracket indexing is used instead of attribute access (``df.big``):
    # attribute access silently resolves to a DataFrame attribute or method
    # whenever a column name happens to collide with one.
    for column in ('big', 'small'):
        df[column] = df[column].apply(split_lower).apply(lemmatize)

    df['gender'] = df['gender'].apply(standartize_gender)
    df['order'] = df['order'].apply(standartize_order)

    return df

    
def explode(df, column):
    """Expand the list-valued ``column`` so that each item gets its own row.

    The sibling answer column (``small`` when exploding ``big`` and vice
    versa) is dropped first, so the result only carries one answer column.
    The input frame is not modified.

    Args:
        df: DataFrame with both ``big`` and ``small`` columns.
        column: Either ``'big'`` or ``'small'``.

    Returns:
        A new DataFrame without the sibling column and with ``column``
        exploded; exploded rows keep the index label of their source row.

    Raises:
        ValueError: If ``column`` is neither ``'big'`` nor ``'small'``.
            (Previously the function fell through and returned ``None``,
            which only failed later and far from the cause.)
    """
    sibling_of = {'big': 'small', 'small': 'big'}
    if column not in sibling_of:
        raise ValueError(
            "column must be 'big' or 'small', got {!r}".format(column)
        )
    return df.drop(columns=sibling_of[column]).explode(column)

    
def add_pos(df):
    """Add a ``pos`` column numbering rows within each consecutive group.

    Groups are runs of adjacent rows that share the same value in the
    *last* column of ``df`` (in ``main`` that column is ``id_person``).
    The first row of each run gets ``0``, the next ``1``, and so on.

    Args:
        df: DataFrame whose last column holds the grouping key. It is
            modified in place.

    Returns:
        The same DataFrame object with the new ``pos`` column appended.
    """
    # Read the key column positionally via ``iloc``. The previous
    # ``row[-1]`` lookup on a label-indexed Series relied on the integer
    # fallback that pandas has deprecated, and ``iterrows`` built a Series
    # per row for no benefit.
    keys = df.iloc[:, -1].tolist()

    positions = []
    for i, key in enumerate(keys):
        # The first row always opens a run; no sentinel value is needed, so
        # a key that happens to equal the old ``-1`` sentinel is handled too.
        if i == 0 or key != keys[i - 1]:
            positions.append(0)
        else:
            positions.append(positions[-1] + 1)

    df['pos'] = positions
    return df
            

In [3]:
def main(filename):
    """Run the full pipeline: load, normalise and reshape to long format.

    The file is loaded with ``get_clear_data`` and cleaned with
    ``normalize``. The ``small`` and ``big`` answer lists are then each
    exploded to one row per item, numbered with ``add_pos``, relabelled to
    a common ``lemma`` column and tagged with a ``type`` of ``'small'`` or
    ``'big'``. Both parts are stacked (small first) under a fresh integer
    index, and rows with an empty-string lemma or any missing value are
    removed.

    Args:
        filename: Path of the input CSV file.

    Returns:
        DataFrame with columns
        ``age, order, lemma, gender, id_person, pos, type``.
    """
    normalized = normalize(get_clear_data(filename))

    parts = []
    for kind in ('small', 'big'):
        # Rename by column name rather than overwriting ``.columns``
        # positionally, so the result does not depend on column order.
        part = add_pos(explode(normalized, kind)).rename(columns={kind: 'lemma'})
        part['type'] = kind
        parts.append(part)

    # ``names=`` was dropped from this call: pd.concat only uses it to label
    # levels of a hierarchical index built from ``keys``, so it had no
    # effect here.
    combined = pd.concat(parts, ignore_index=True)
    combined = combined[combined['lemma'] != '']
    return combined.dropna()


In [4]:
df = main("+ constraint.csv")  # input file name
df.head()

Unnamed: 0,age,order,lemma,gender,id_person,pos,type
0,21,s-b,бутылка,ж,0,0,small
1,21,s-b,чашка,ж,0,1,small
2,21,s-b,банка,ж,0,2,small
3,21,s-b,ручка,ж,0,3,small
4,21,s-b,карандаш,ж,0,4,small


In [5]:
# Disabled exports: each pair below would write lemma frequency counts for
# one subset of `df` to its own CSV file. Uncomment the pairs that are needed.
# NOTE(review): `small_s_b` additionally filters on gender == 'м' while the
# other three order/type subsets do not — confirm whether that is intended.

# all_big = df[df['type'] == 'big']['lemma'].value_counts()
# all_big.to_csv('-all_big.csv', index=True)

# all_small = df[df['type'] == 'small']['lemma'].value_counts()
# all_small.to_csv('-all_small.csv', index=True)

# small_s_b = df[(df['type'] == 'small') & (df['order'] == 's-b') & (df['gender'] == 'м')]['lemma'].value_counts()
# small_s_b.to_csv('-small_s_b.csv', index=True)

# small_b_s = df[(df['type'] == 'small') & (df['order'] == 'b-s')]['lemma'].value_counts()
# small_b_s.to_csv('-small_b_s.csv', index=True)

# big_s_b = df[(df['type'] == 'big') & (df['order'] == 's-b')]['lemma'].value_counts()
# big_s_b.to_csv('-big_s_b.csv', index=True)

# big_b_s = df[(df['type'] == 'big') & (df['order'] == 'b-s')]['lemma'].value_counts()
# big_b_s.to_csv('-big_b_s.csv', index=True)

In [6]:
# Lemma frequencies for rows with type 'small', order 'b-s' and age above 40,
# written to NAME.csv with the lemma as the index column.
NAME = df.loc[
    (df['age'] > 40) & (df['order'] == 'b-s') & (df['type'] == 'small'),
    'lemma',
].value_counts()
NAME.to_csv('NAME.csv', index=True)