### Autotagging example from Turi:
###### https://turi.com/learn/userguide/data_matching/autotagger.html

## Trying auto tagging with bank statements

In [1]:
import graphlab as gl
import graphlab.aggregate as agg
# Load the bank-statement rows (SN, Description, Amount) from CSV into an SFrame.
transactions = gl.SFrame.read_csv("balances.csv")

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\MORPH_~1\AppData\Local\Temp\graphlab_server_1487535140.log.0


This non-commercial license of GraphLab Create for academic use is assigned to <redacted> and will expire on January 15, 2018.


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[long,str,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [2]:
# Preview the first ten statement rows.
transactions.head(10)

SN,Description,Amount
1,Transportation to Lagos,5000
2,Kitchen Utensil,1000
3,Laundary,5000
4,Fuel for car,4000
5,Diesel for generator,5000
6,Cisco certifications,1000
7,Gift to mom,10000
8,Gift to dad,10000
9,Transportation to Abuja,10000
10,Fuel for car,10000


In [3]:
# Load the candidate tag names (one string column) from CSV into an SFrame.
topics = gl.SFrame.read_csv("topics.csv")

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [4]:
# Preview the first ten candidate tags.
topics.head(10)

category
Gifts
Fuel
Transportation
Shopping
POS
Transfers
Fees
Bills
Electricity
Tithes


In [5]:
# Build the tagger from the `category` values in `topics`
# (e.g. transfers, pos, transportation, bills), without training chatter.
goodwealth = gl.autotagger.create(
    topics,
    verbose=False,
)
goodwealth.summary()

Class                                : NearestNeighborAutoTagger

Schema
------
Number of examples                   : 10
Number of feature columns            : 3

Training
--------
Total training time (seconds)        : 1.0023

Accessible fields
-----------------
nearest_neighbors_model             : Model used internally to compute nearest neighbors.


In [6]:
# Expose the free-text descriptions under the column name used as the tagger query.
transactions['transaction_desc'] = transactions['Description']

# Score every description against the known categories and show the matches.
gw_tags = goodwealth.tag(
    transactions,
    query_name='transaction_desc',
    k=20,
    similarity_threshold=0.05,
    verbose=True,
)
gw_tags.print_rows(10, max_row_width=100, max_column_width=50)

+---------------------+-------------------------+----------------+----------------+
| transaction_desc_id |     transaction_desc    |    category    |     score      |
+---------------------+-------------------------+----------------+----------------+
|          0          | Transportation to Lagos | Transportation |      1.0       |
|          0          | Transportation to Lagos |   Transfers    | 0.117647058824 |
|          3          |       Fuel for car      |      Fuel      |      1.0       |
|          5          |   Cisco certifications  | Transportation | 0.166666666667 |
|          6          |       Gift to mom       |     Gifts      | 0.333333333333 |
|          7          |       Gift to dad       |     Gifts      | 0.333333333333 |
|          8          | Transportation to Abuja | Transportation |      1.0       |
|          8          | Transportation to Abuja |   Transfers    | 0.117647058824 |
|          9          |       Fuel for car      |      Fuel      |      1.0       |
+---------------------+-------------------------+----------------+----------------+

In [7]:
# Collapse the per-match rows into one row per transaction holding a list of tags.
# Bind the result of `rename` instead of relying on it mutating `gw_tags` in place.
gw_tags = gw_tags.rename({'transaction_desc_id': 'id'})
gw_tags = gw_tags[['id', 'category']].unstack('category', new_column_name='category')

# In the tagger output above, `transaction_desc_id` lines up with the 0-based row
# position in `transactions`, so a row number serves as the join key
# (NOTE(review): confirm this holds if `transactions` is filtered or reordered first).
# Guarded because `add_row_number` raises when the column already exists, which
# used to break this cell whenever the tagging cells were re-run.
if 'id' not in transactions.column_names():
    transactions = transactions.add_row_number('id')
gw_tags = gw_tags.join(transactions[['Description', 'Amount', 'id']], on='id', how='left')



In [8]:
# Show the tagged transactions: one row per id with its tag list, description and amount.
gw_tags

id,category,Description,Amount
0,"[Transportation, Transfers] ...",Transportation to Lagos,5000
3,[Fuel],Fuel for car,4000
5,[Transportation],Cisco certifications,1000
6,[Gifts],Gift to mom,10000
7,[Gifts],Gift to dad,10000
8,"[Transportation, Transfers] ...",Transportation to Abuja,10000
9,[Fuel],Fuel for car,10000


In [9]:
# Mean and total spend for each distinct tag combination.
# `agg` (graphlab.aggregate) is imported in the notebook's first cell with the
# other imports, rather than part-way through the analysis.
gw_tags.groupby(
    key_columns='category',
    operations={
        'mean amount': agg.MEAN('Amount'),
        'total amount': agg.SUM('Amount'),
    },
)

category,total amount,mean amount
[Fuel],14000,7000.0
[Transportation],1000,1000.0
[Gifts],20000,10000.0
"[Transportation, Transfers] ...",15000,7500.0


In [10]:
# Keep the per-tag-combination spend summary (mean and total amount) for later cells.
transaction_categories = gw_tags.groupby(
    'category',
    {
        'mean amount': agg.MEAN('Amount'),
        'total amount': agg.SUM('Amount'),
    },
)

In [11]:
def take_first(mylist):
    """Return the element at index 0 of ``mylist``.

    Raises whatever ``mylist[0]`` raises (``IndexError`` for an empty list).
    """
    first_item = mylist[0]
    return first_item
# Label each tag combination by its first tag. Reuses the `take_first` helper
# defined just above (previously dead code duplicated by an inline lambda) and
# maps over the `category` column only instead of every whole row.
transaction_categories['class'] = transaction_categories['category'].apply(take_first)

In [12]:
# Show the spend summary together with the derived `class` label.
transaction_categories

category,total amount,mean amount,class
[Fuel],14000,7000.0,Fuel
[Transportation],1000,1000.0,Transportation
[Gifts],20000,10000.0,Gifts
"[Transportation, Transfers] ...",15000,7500.0,Transportation


In [13]:
# Route GraphLab Canvas visualizations to a web browser.
gl.canvas.set_target('browser')

In [14]:
# Bar chart of total spend per class.
transaction_categories.show(
    view="Bar Chart",
    x="class",
    y="total amount",
)

Canvas is accessible via web browser at the URL: http://localhost:54475/index.html
Opening Canvas in default web browser.
