In [5]:
using Random
Random.seed!(13)

using CSV
using Plots
using DataFrames
using Statistics
using Missings
using Distributions
using StatsPlots
using LowRankModels, LinearAlgebra

In [6]:
include("proxgrad.jl")
df = CSV.read("airline.csv")

Unnamed: 0_level_0,airline_name,link,title,author
Unnamed: 0_level_1,String,String,String,String
1,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,D Ito
2,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Ron Kuhlmann
3,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,E Albin
4,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Tercon Bojan
5,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,L James
6,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Dzano Edin
7,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,M Tushishvili
8,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,M Znidar
9,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Tercon Bojan
10,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,M Znidar


In [7]:
# Organize and display features along with variable type
feature_names = names(df)
for i in 1:20
    println(string(i), "\t", string(feature_names[i]), "\t\t\t", string(eltype(df[!, i])))
end

1	airline_name			String
2	link			String
3	title			String
4	author			String
5	author_country			Union{Missing, String}
6	date			Dates.Date
7	content			String
8	aircraft			Union{Missing, String}
9	type_traveller			Union{Missing, String}
10	cabin_flown			Union{Missing, String}
11	route			Union{Missing, String}
12	overall_rating			Union{Missing, Float64}
13	seat_comfort_rating			Union{Missing, Float64}
14	cabin_staff_rating			Union{Missing, Float64}
15	food_beverages_rating			Union{Missing, Float64}
16	inflight_entertainment_rating			Union{Missing, Float64}
17	ground_service_rating			Union{Missing, Float64}
18	wifi_connectivity_rating			Union{Missing, Float64}
19	value_money_rating			Union{Missing, Float64}
20	recommended			Int64


In [8]:
# df = df[1:1000,:]

df_OR = df[.!(ismissing.(df[!, :content])), :];
df_content = df_OR[:,:content]

41396-element Array{String,1}:
 "Outbound flight FRA/PRN A319. 2 hours 10 min flight. I thought drinks/snacks for sale but sandwich soft drinks were served complimentary. Inbound flights SKP/LJU/FRA CRJ900. each 1 hour 30 min flight. Skyshop menu was in a seat pocket and drinks/snacks were for sale. All flight crews were friendly. Security check at the Ljubljana airport for transit passengers was chaos however it's possible to go to a gate within 30min."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [9]:
# Create wordlist with all words in text reviews
wordlist = String[]
for line in df_content
    words = split(line, r"\W")
    map(w -> push!(wordlist, lowercase(w)), words)
end
filter!(!isempty, wordlist)

4923786-element Array{String,1}:
 "outbound"   
 "flight"     
 "fra"        
 "prn"        
 "a319"       
 "2"          
 "hours"      
 "10"         
 "min"        
 "flight"     
 "i"          
 "thought"    
 "drinks"     
 ⋮            
 "at"         
 "the"        
 "gate"       
 "no"         
 "explanation"
 "offered"    
 "no"         
 "drinks"     
 "available"  
 "on"         
 "the"        
 "flight"     

In [10]:
# Create dictionary that counts number of occurances for each word
wordcounts = Dict{String,Int64}()
for word in wordlist
    wordcounts[word]=get(wordcounts, word, 0) + 1
end

# Sort to find most common words
common_words = sort(collect(wordcounts), by = tuple -> last(tuple), rev=true)

36721-element Array{Pair{String,Int64},1}:
           "the" => 231613
           "and" => 154486
            "to" => 151789
           "was" => 107931
             "a" => 98839 
             "i" => 81266 
            "on" => 78689 
            "in" => 77581 
        "flight" => 69612 
            "of" => 62870 
           "for" => 54457 
          "with" => 49020 
          "were" => 48286 
                 ⋮        
          "0610" => 1     
         "ey431" => 1     
        "nagged" => 1     
       "roaches" => 1     
 "canelécookies" => 1     
      "smuggled" => 1     
  "germandwings" => 1     
         "ul318" => 1     
          "weez" => 1     
       "leavers" => 1     
          "3276" => 1     
       "chárter" => 1     

In [11]:
num_lines = 500

for i in 1:1:num_lines
    println(common_words[i])
end

"the" => 231613
"and" => 154486
"to" => 151789
"was" => 107931
"a" => 98839
"i" => 81266
"on" => 78689
"in" => 77581
"flight" => 69612
"of" => 62870
"for" => 54457
"with" => 49020
"were" => 48286
"we" => 43485
"not" => 36874
"is" => 34762
"it" => 33726
"but" => 33457
"at" => 33424
"that" => 32117
"from" => 31275
"they" => 28756
"had" => 28123
"very" => 27921
"my" => 27521
"no" => 25476
"as" => 25204
"this" => 24874
"have" => 24100
"service" => 23420
"time" => 22263
"good" => 21046
"food" => 20408
"seats" => 19629
"you" => 19433
"all" => 18573
"be" => 18218
"t" => 17516
"seat" => 17094
"an" => 16866
"are" => 16808
"flights" => 16656
"crew" => 16239
"staff" => 15878
"our" => 15854
"so" => 15781
"would" => 14457
"there" => 14382
"only" => 14134
"plane" => 13697
"class" => 13083
"one" => 13023
"cabin" => 12923
"return" => 12760
"which" => 12663
"again" => 12597
"airline" => 12549
"check" => 12093
"us" => 11887
"or" => 11875
"when" => 11835
"by" => 11625
"me" => 11455
"their" => 11414
"hour

In [12]:
"good" "first" "entertainment" "friendly" "new" "comfortable" "great" "delayed" "excellent" "nice" "late" "better" "old" "offered"
"free" "delay" "helpful" "early" "pleasant" "best" "available" 

efficient, recommend, uncomfortable, cancelled, comfort, problems, delays, UNITED, cheap, awful

LoadError: syntax: extra token """ after end of expression

In [30]:
bag_of_words = ["good",
                "first",
                "entertainment",
                "friendly",
                "new",
                "comfortable",
                "great",
                "delayed",
                "excellent",
                "nice",
                "late",
                "better",
                "offered",
                "free",
                "delay",
                "helpful",
                "early",
                "pleasant",
                "best",
                "available",
                "efficient",
                "recommend",
                "uncomfortable",
                "cancelled",
                "comfort",
                "problems",
                "delays",
                "united",
                "cheap",
                "awful"
                ]

"Computes a onehot vector for every entry in column given a bag of words"
function onehot_col(column, words=bag_of_words)
    result = zeros(size(column,1), size(words,1))
    for i = 1:size(column,1)
        for j in 1:size(words,1)
            if occursin(words[j],column[i])
                result[i,j] = 1
            end
        end
    end
    result
end

onehot_col

In [29]:
onehot_col(df_content)

2×11 Array{Float64,2}:
 0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0