# Move on to next jupyter notebook for modeling

In [52]:
import glob, os    
import pandas as pd

df_CA = pd.read_csv('csv2/combined/CA.csv')
cols = ['user_name', 'trail', 'trail_url', 'stars', 'date', 'review_text', 
       'trail_avg_star','difficulty', 'distance', 'elevation', 'route_type',  'location','tag_list',
       'short_description']
df_CA=df_CA[cols]
df_CA.shape

(579782, 14)

## Convert 'stars' column from string to int value

In [53]:
def convert_stars_to_int(x):
    star = x.split(' ')[0]
    try:
        int(star)
        rating = int(star)
    except:
        rating = None
    return rating

In [54]:
df_CA['stars']= df_CA['stars'].apply(convert_stars_to_int)
df_CA['stars'].unique()

array([ 5.,  4.,  3., nan,  0.,  2.,  1., -1.])

In [55]:
## Remove odd star ratings (0, -1) after checking the review text. They seem to have no review_text indicating that it is likely an error
df_CA = df_CA[df_CA['stars'].isin([1,2,3,4,5])]

## Convert date to datetime format

In [56]:
df_CA['date']= pd.to_datetime(df_CA['date'], infer_datetime_format=True)

In [57]:
df_CA.head()

Unnamed: 0,user_name,trail,trail_url,stars,date,review_text,trail_avg_star,difficulty,distance,elevation,route_type,location,tag_list,short_description
0,Melody Romero,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-10-22,,4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...
1,Gretchen Cremo,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-10-20,"Great hike, really enjoyed the Tuolumne River,...",4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...
2,Yemia Hashimoto,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-10-09,"Oct9 hiked in, Oct 11 hiked out. We were a gro...",4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...
3,Jose Ballesteros,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-10-09,absolutely worth every step. didnt see anyone...,4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...
4,Briana Wharton,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-08-29,Awesome trail that was partially shaded throug...,4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...


## Unique usernames

In [59]:
df_CA.groupby('user_name').count().sort_values('stars', ascending=False)
df_CA = df_CA[df_CA['user_name']!= 'AllTrails User']
df_CA = df_CA[df_CA['user_name']!= 'AllTrails User ']

## For user & trail combination that was reviewed more than once, we will take average value of the star rating

In [61]:
df = df_CA.drop_duplicates(subset=['user_name', 'trail_url'], keep='first', ignore_index=True)
df['user_id'] = df['user_name'].astype('category').cat.codes
df.to_csv('csv2/combined/CA_review_cleaned.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_id'] = df['user_name'].astype('category').cat.codes


In [62]:
df.head()

Unnamed: 0,user_name,trail,trail_url,stars,date,review_text,trail_avg_star,difficulty,distance,elevation,route_type,location,tag_list,short_description,user_id
0,Melody Romero,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-10-22,,4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...,126924
1,Gretchen Cremo,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-10-20,"Great hike, really enjoyed the Tuolumne River,...",4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...,66692
2,Yemia Hashimoto,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-10-09,"Oct9 hiked in, Oct 11 hiked out. We were a gro...",4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...,186446
3,Jose Ballesteros,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-10-09,absolutely worth every step. didnt see anyone...,4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...,91120
4,Briana Wharton,Tuolumne Meadows to Glen Aulin High Sierra Camp,www.alltrails.com/trail/us/california/tuolumne...,5.0,2020-08-29,Awesome trail that was partially shaded throug...,4.5,moderate,12.1 mi,1200 ft,Out & back,Yosemite National Park,"['Backpacking', 'Camping', 'Fishing', 'Hiking'...",Tuolumne Meadows to Glen Aulin High Sierra Cam...,25826
